#-----------------------------------------------
# This file adds variables we use in the book.
# The variables are added to "Master_plus_Snow.csv"
#-----------------------------------------------


# Directory that holds "Master_plus_Snow.csv". It defaults to the original
# author's location; set the environment variable TARGETING_DATA_DIR to run the
# script on another machine without editing this file.
data_dir <- Sys.getenv("TARGETING_DATA_DIR", unset = NA)
if (is.na(data_dir) || !nzchar(data_dir)) {
  data_dir <- "C:/Users/ac6037/Dropbox/TARGETING/replication2018/ANALYSIS OF MASTER DATA"
}
setwd(data_dir)

library(readr)
# The master file is stored in Shift-JIS encoding.
dat <- read_csv("Master_plus_Snow.csv", locale = locale(encoding = "SHIFT-JIS"))
dim(dat)
# 105353    440

# Allow very long objects to be printed in full during interactive checks.
options(max.print = 1000000)

# Make an id column, so can merge whatever pruned data frame we make back into
# dat. seq_len() (unlike seq.int(n)) yields an empty sequence for zero rows.

dat$id <- seq_len(nrow(dat))









# ---------------------------------------
# PART 1. 
# VARIABLES NEEDED FOR MUN-LEVEL TURNOUT ANALYSES

# ---------------------------------------


# -------------------------------
# 
# district_year and district-reform
#
#---------------------------------

# Keep only the rows that are assigned to an electoral district.
has_district <- !is.na(dat[["hor_electoral_district"]])
elec.dat <- dat[has_district, ]

# district_year: "<year>_<district>" key, used for district-year fixed effects.
elec.dat$district_year <- paste0(
  elec.dat$year, "_", elec.dat$hor_electoral_district
)
head(elec.dat$district_year)
class(elec.dat$district_year)

# district_reform: "<reform>_<district>" key, used for district fixed effects
# (some districts are under SNTV; others under SSD).
elec.dat$district_reform <- paste0(
  elec.dat$reform, "_", elec.dat$hor_electoral_district
)
head(elec.dat$district_reform)
class(elec.dat$district_reform)







# -----------------------------------------------------
#
# Create a lagged turnout variable
#
#---------------------------------

# Reduce dataset to id, year, muncode_num and turnout.
myvars <- c("id", "year", "muncode_num", "mun_turnout")
dat2 <- elec.dat[myvars]

# Restrict dataset to municipality-years with NEEDs data (because we need
# demographic and fiscal vars):
dat3 <- dat2[!is.na(dat2$muncode_num), ]
sum(is.na(dat3$mun_turnout))

# Order by muncode_num first, and then year. The sort keys are passed as plain
# vectors selected by name: `dat3[, 3]` is a one-column data frame when dat3 is
# a tibble, and order() is not meant to be handed data frames.
dat4 <- dat3[order(dat3$muncode_num, dat3$year, decreasing = FALSE), ]
dat4[1:50, ]

# Lets make sure there is only one muncode_num per year
# (number of duplicated municipality-year pairs; expected 0):
sum(duplicated(dat4[, c("muncode_num", "year")]))

# Create a lagged turnout variable: within each municipality, the turnout of
# the preceding row (rows are sorted by year above).
dat4$time <- dat4$year
library(data.table)
dat5 <- as.data.table(dat4)

# A single lag (n = 1) is requested: only one target column is named, so a
# second lag (the original `1:2`) could never be stored.
dat5 <- dat5[, Lmun_turnout := shift(mun_turnout, n = 1L, type = "lag"),
             by = muncode_num]
# Can check here:
dat5[1:50, ]

# We made these variables on a subset of data. Lets merge this back into
# elec.dat:
names(dat5)
# Restrict to id plus new var (selected by name, not by column position):
dat6 <- dat5[, c("id", "Lmun_turnout")]

# Lets attach these new variables onto elec.dat. all = TRUE keeps the rows of
# elec.dat that were dropped from the subset (they get NA for Lmun_turnout).
dta <- merge(elec.dat, dat6, by = "id", all = TRUE)

elec.dat <- dta








# ------------------------------------
# Create Herfindahl indexes capturing the degree to which each municipality
# concentrated its votes on a single candidate or spread their votes out across a number
# of candidates, for different sets of candidates:

# -----------------------------------------------
# (a) vs_conc_c 

# An index capturing the degree to which municipality m
# concentrated its votes on a single candidate, versus spread them out evenly across all the candidates who ran in  
# district.
# (Spreading them out == 0; concentrating them == 1). 

# Vshare1, Vshare2, ..., Vshare17 are the shares of the votes cast in the municipality that went to
# candidates 1, 2, ..., 17 respectively (cand_01_votes/mun_voted, etc.).

# vs_conc_c is computed for all municipality-years at once.
#
# Step 1: sum of squared vote shares over candidate slots 1-17. Missing shares
# are dropped, so a row with no recorded shares gets 0.
#
# This index is not in and of itself comparable across districts because the
# number of candidates running in each district differs.
# If municipality m concentrates its votes on a single cand, the sum is 1. But
# if it divides its votes evenly among five candidates, the sum is 0.2, or
# among four candidates, 0.25, etc.  Even though municipality m divided its
# votes evenly among the candidates running, the municipality with four
# candidates is recorded as having a higher concentration (0.25 vs. 0.2).
#
# Step 2: standardize by the number of candidates running in the district
# (ncands_electoral_district), such that municipalities that evenly divide
# their votes across n candidates get 0 and those that concentrate their votes
# on a single candidate get 1. When a single candidate runs in the district the
# denominator is 0, so the result is not finite.
vs_conc_c <- local({
  shares <- as.matrix(elec.dat[paste0("Vshare", 1:17)])
  sum_sq <- unname(rowSums(shares^2, na.rm = TRUE))
  n_cands <- elec.dat$ncands_electoral_district
  (sum_sq - 1 / n_cands) / (1 - 1 / n_cands)
})
length(vs_conc_c)
elec.dat$vs_conc_c <- vs_conc_c
# Prints NA if any value is missing.
range(elec.dat$vs_conc_c)








# -----------------------------------------------
# (b)  vs_conc_w
# An index capturing the degree to which municipality m concentrated its votes on a single winner.

# Sum of squared vote shares over candidate slots 1-6, counting only slots
# whose win_k flag is non-zero (each term is win_k * Vsharek^2). Terms that are
# NA are dropped, so a row with no usable term gets 0. Computed for all
# municipality-years at once.
vs_conc_w <- local({
  slots <- 1:6
  won <- as.matrix(elec.dat[paste0("win_", slots)])
  shares <- as.matrix(elec.dat[paste0("Vshare", slots)])
  unname(rowSums(won * shares^2, na.rm = TRUE))
})
length(vs_conc_w)
elec.dat$vs_conc_w <- vs_conc_w
# Prints NA if any value is missing.
range(elec.dat$vs_conc_w)











# --------------------------------------------
# (c) vs_conc_wldp, new.rank.vs_conc_wldp, rel.rank.vs_conc_wldp

# An index capturing the degree to which municipality m concentrated its votes on a single LDP winner:

# Sum over candidate slots 1-6 of LDP_k * win_k * Vsharek^2, i.e. squared vote
# shares of candidates flagged both as LDP and as winners. Terms that are NA
# are dropped, so a row with no usable term gets 0. Computed for all
# municipality-years at once.
vs_conc_wldp <- local({
  slots <- 1:6
  is_ldp <- as.matrix(elec.dat[paste0("LDP_", slots)])
  won <- as.matrix(elec.dat[paste0("win_", slots)])
  shares <- as.matrix(elec.dat[paste0("Vshare", slots)])
  unname(rowSums(is_ldp * won * shares^2, na.rm = TRUE))
})
length(vs_conc_wldp)
elec.dat$vs_conc_wldp <- vs_conc_wldp
# Prints NA if any value is missing.
range(elec.dat$vs_conc_wldp)

# NB: when there are no LDP winners in a district, municipalities have vs_conc_wldp == 0
check <- subset(elec.dat, elec.dat$year == 2003 & elec.dat$cand_01_pty == "DPJ")
sum(check$vs_conc_wldp)
# 0

# NB: when there are no LDP candidates in a district, municipalities have vs_conc_wldp == 0
check <- subset(elec.dat, elec.dat$VotesLDP == 0)
sum(check$vs_conc_wldp)
# 0

# Now, we will use each municipality's vs_conc_wldp score to rank municipalities in each district-year
# according to the degree to which they concentrated their votes on a single LDP winner.

# Rank municipalities on vs_conc_wldp within each district_year; tied values
# share the average of the ranks they span.
elec.dat <- transform(
  elec.dat,
  rank.vs_conc_wldp = ave(
    x = vs_conc_wldp,
    district_year,
    FUN = function(scores) rank(scores, ties.method = "average")
  )
)

# This assigns higher ranks to municipalities with higher vs_conc_wldp within the district,
# meaning that municipalities with more concentrated vote shares receive higher numbers 
# (if we want the opposite, use rank(-x)).
# e.g. the municipality with the most concentrated vote share for LDP winners in a district-year 
# with 22 municipalities receives rank==22.

# When no LDP candidate runs, all municipalities have vs_conc_wldp equal to 0.
# When no LDP candidate wins, all municipalities have vs_conc_wldp equal to 0.
# When there is only one municipality in a district-year, it receives rank = 1.
# (NB: we get rid of these when we standardize ranks.)

# In the first two cases every municipality in the district-year is tied at 0, so they are
# all assigned the same rank.
# We need to change their rank.vs_conc_wldp to NA to reflect the fact that rankings cannot be 
# constructed when no candidate fits the criteria.

# Spot checks of the raw ranks (output is inspected interactively).

# District 101 in 1980:
check <- subset(elec.dat, year == 1980 & hor_electoral_district == 101)
nrow(check)
check[c("muncode_num", "district_year", "vs_conc_wldp", "rank.vs_conc_wldp")]

# District 101 in 1996:
check <- subset(elec.dat, year == 1996 & hor_electoral_district == 101)
nrow(check)
check[c("muncode_num", "district_year", "vs_conc_wldp", "rank.vs_conc_wldp")]

# Municipality-years with VotesLDP equal to 0:
check <- subset(elec.dat, VotesLDP == 0)
check[["vs_conc_wldp"]]
check[["rank.vs_conc_wldp"]]

# District-year "2014_110":
check <- subset(elec.dat, district_year == "2014_110")
check[["vs_conc_wldp"]]
check[["rank.vs_conc_wldp"]]

# Pull the district_years in which all municipality-years received 0 on
# vs_conc_wldp (either no LDP winner or no LDP candidate), and store them in a
# vector called all_same. The per-group test is done in one pass with tapply()
# instead of re-subsetting elec.dat once per district-year.

year_dists <- unique(elec.dat$district_year)
all_same <- local({
  # TRUE for a district-year when every one of its rows has vs_conc_wldp == 0.
  is_all_zero <- tapply(elec.dat$vs_conc_wldp == 0, elec.dat$district_year, all)
  # A missing vs_conc_wldp would make the test undecidable; fail loudly.
  stopifnot(!anyNA(is_all_zero))
  # Look the groups up by name so that all_same keeps the order of year_dists.
  year_dists[as.vector(is_all_zero[as.character(year_dists)])]
})

# How many district_years in all_same?
length(all_same)
# 798

# Create a new version of the "rank" variable that receives NA when
# municipalities are in district_years listed in all_same (every municipality
# scored 0 on vs_conc_wldp), and the original rank otherwise:
elec.dat$new.rank.vs_conc_wldp <- ifelse(
  elec.dat$district_year %in% all_same,
  NA,
  elec.dat$rank.vs_conc_wldp
)

# Check: rows whose new rank is NA should all have vs_conc_wldp == 0.
ex <- elec.dat[is.na(elec.dat$new.rank.vs_conc_wldp), ]
sum(ex$vs_conc_wldp) # 0

check <- subset(elec.dat, elec.dat$year == 1996 & elec.dat$hor_electoral_district == 101)
check[, c("muncode_num", "district_year", "vs_conc_wldp", "new.rank.vs_conc_wldp")]

# Rows with VotesLDP == 0: the two numbers printed below should agree.
check <- subset(elec.dat, elec.dat$VotesLDP == 0)
nrow(check)
sum(is.na(check$new.rank.vs_conc_wldp))

check <- subset(elec.dat, elec.dat$district_year == "2014_110")
check$vs_conc_wldp
check$new.rank.vs_conc_wldp

# Rows with num equal to 1 and a non-zero vs_conc_wldp (printed once; the
# original printed the same column twice).
check <- subset(elec.dat, elec.dat$num == 1 & elec.dat$vs_conc_wldp != 0)
check[, c("new.rank.vs_conc_wldp")]

# So that municipalities in districts with different numbers of municipalities
# can be compared to each other, let us take new.rank.vs_conc_wldp (which is an absolute rank) 
# and standardize it across districts, so that 1 is the municipality ranking highest on this 
# in a district (with the vote share that is most concentrated on LDP winners) and 0 is the 
# lowest-ranked municipality in the district.  We want to do this for every municipality-year
# for which its ranked variable (new.rank.vs_conc_wldp) is not NA.
# We use this formula: (absolute rank-1)/(total number of munis-1) 

# For the municipality-years in which the ranked variable (new.rank.vs_conc_wldp) *is* NA,
# the new variable is assigned NA. 

# Standardized rank, computed for all rows at once:
# (new.rank.vs_conc_wldp - 1) / (num - 1), and NA wherever the absolute rank is
# NA. When num is 1 the expression is 0/0, i.e. NaN (handled further below).
rel.rank.vs_conc_wldp <- local({
  abs_rank <- elec.dat$new.rank.vs_conc_wldp
  ifelse(is.na(abs_rank), NA_real_, (abs_rank - 1) / (elec.dat$num - 1))
})
length(rel.rank.vs_conc_wldp) == nrow(elec.dat)
elec.dat$rel.rank.vs_conc_wldp <- rel.rank.vs_conc_wldp

# Check:
check <- subset(elec.dat, elec.dat$year == 1986 & elec.dat$hor_electoral_district == 101)
nrow(check)
check[, c("muncode_num", "district_year", "vs_conc_wldp", "new.rank.vs_conc_wldp", "rel.rank.vs_conc_wldp")]


# Check:
# How many NAs (is.na() is also TRUE for NaN):
sum(is.na(elec.dat$rel.rank.vs_conc_wldp)) # 6171
# How many NaNs?
sum(is.nan(elec.dat$rel.rank.vs_conc_wldp))
# 94

# These should all be municipality-years for which num is 1.
ex <- elec.dat[is.nan(elec.dat$rel.rank.vs_conc_wldp), ]
table(ex$num)
## yes, all have 1

# Convert NaN to NA in every column of elec.dat (not only the new one). Only
# double and complex columns can hold NaN, so other columns are left untouched;
# this avoids building a rows-by-columns logical matrix over the whole frame.
elec.dat[] <- lapply(elec.dat, function(column) {
  if (is.double(column) || is.complex(column)) {
    column[is.nan(column)] <- NA
  }
  column
})










# -------------------------------------
# (d) LDP_Vshare_c (for placebo tests)
# Lets calculate the vote share captured by LDP candidates in the municipality, regardless of whether
# they won or lost:

# Sum over candidate slots 1-17 of LDP_k * Vsharek: the combined vote share of
# all candidates flagged as LDP, whether they won or lost. Terms that are NA
# are dropped, so a row with no usable term gets 0. Computed for all
# municipality-years at once.
LDP_Vshare_c <- local({
  slots <- 1:17
  is_ldp <- as.matrix(elec.dat[paste0("LDP_", slots)])
  shares <- as.matrix(elec.dat[paste0("Vshare", slots)])
  unname(rowSums(is_ldp * shares, na.rm = TRUE))
})
length(LDP_Vshare_c)
elec.dat$LDP_Vshare_c <- LDP_Vshare_c
# Prints NA if any value is missing.
range(elec.dat$LDP_Vshare_c)
head(elec.dat$LDP_Vshare_c)





# ------------------------------------
# LDP_Vshare_w
# Lets calculate the vote share captured by LDP winners in the municipality: 

# Sum over candidate slots 1-6 of LDP_k * win_k * Vsharek: the combined vote
# share of candidates flagged both as LDP and as winners. Terms that are NA are
# dropped, so a row with no usable term gets 0. Computed for all
# municipality-years at once.
LDP_Vshare_w <- local({
  slots <- 1:6
  is_ldp <- as.matrix(elec.dat[paste0("LDP_", slots)])
  won <- as.matrix(elec.dat[paste0("win_", slots)])
  shares <- as.matrix(elec.dat[paste0("Vshare", slots)])
  unname(rowSums(is_ldp * won * shares, na.rm = TRUE))
})
length(LDP_Vshare_w)
elec.dat$LDP_Vshare_w <- LDP_Vshare_w
# Prints NA if any value is missing.
range(elec.dat$LDP_Vshare_w)
head(elec.dat$LDP_Vshare_w)








# --------------------------------------------
# (e) vs_conc_wopp

# Lets create indexes capturing the degree to which municipality m concentrated its 
# votes on each opposition party.  Do JSP, NFP, and DPJ

# Party indicator columns for candidate slots 1-6 and the parties JSP, NFP and
# DPJ. For slot k and party P, column P_k is 1 where cand_0k_pty equals P, 0
# where it is a different party, and NA where cand_0k_pty is missing.
# Columns are added in the order JSP_k, NFP_k, DPJ_k for k = 1, ..., 6.
for (k in 1:6) {
  party_col <- sprintf("cand_%02d_pty", k)
  cand_party <- elec.dat[[party_col]]
  if (is.null(cand_party)) {
    stop("column '", party_col, "' not found in elec.dat", call. = FALSE)
  }
  for (party in c("JSP", "NFP", "DPJ")) {
    indicator <- ifelse(cand_party == party, 1, 0)
    elec.dat[[paste0(party, "_", k)]] <- indicator
    # Also keep a free-standing vector of the same name (JSP_1, NFP_1, ...),
    # as the earlier version of this script left these in the workspace.
    assign(paste0(party, "_", k), indicator)
  }
}
rm(k, party, party_col, cand_party, indicator)

# Sanity check: the new columns exist and elec.dat kept its row count.
nrow(elec.dat)
all(paste0(rep(c("JSP", "NFP", "DPJ"), times = 6), "_", rep(1:6, each = 3)) %in%
      names(elec.dat))

# ---------------------------------
# make an index like vs_conc_wldp above, with JSP

# Sum over candidate slots 1-6 of JSP_k * win_k * Vsharek^2: squared vote
# shares of candidates flagged both as JSP and as winners. Terms that are NA
# are dropped, so a row with no usable term gets 0. Computed for all
# municipality-years at once.
vs_conc_wjsp <- local({
  slots <- 1:6
  is_jsp <- as.matrix(elec.dat[paste0("JSP_", slots)])
  won <- as.matrix(elec.dat[paste0("win_", slots)])
  shares <- as.matrix(elec.dat[paste0("Vshare", slots)])
  unname(rowSums(is_jsp * won * shares^2, na.rm = TRUE))
})
length(vs_conc_wjsp)
elec.dat$vs_conc_wjsp <- vs_conc_wjsp
# Prints NA if any value is missing.
range(elec.dat$vs_conc_wjsp)

# vs_conc_wjsp should get 0 for all elections after 1993
check <- (subset(elec.dat, elec.dat$year > 1995))
table(check$vs_conc_wjsp)


# ---------------------------------
# make an index like vs_conc_wldp above, with DPJ

# Sum over candidate slots 1-6 of DPJ_k * win_k * Vsharek^2: squared vote
# shares of candidates flagged both as DPJ and as winners. Terms that are NA
# are dropped, so a row with no usable term gets 0. Computed for all
# municipality-years at once.
vs_conc_wdpj <- local({
  slots <- 1:6
  is_dpj <- as.matrix(elec.dat[paste0("DPJ_", slots)])
  won <- as.matrix(elec.dat[paste0("win_", slots)])
  shares <- as.matrix(elec.dat[paste0("Vshare", slots)])
  unname(rowSums(is_dpj * won * shares^2, na.rm = TRUE))
})
length(vs_conc_wdpj)
elec.dat$vs_conc_wdpj <- vs_conc_wdpj
# Prints NA if any value is missing.
range(elec.dat$vs_conc_wdpj)

# NB: when there are no DPJ winners in a district, municipalities have vs_conc_wdpj == 0
check <- subset(elec.dat, elec.dat$year > 1995 & elec.dat$cand_02_pty == "DPJ")
sum(check$vs_conc_wdpj)
# 0
# Should get 0 for all elections before 1995
check <- (subset(elec.dat, elec.dat$year < 1995))
table(check$vs_conc_wdpj)




# ---------------------------------
# make an index like vs_conc_wldp above, with NFP

# Sum over candidate slots 1-6 of NFP_k * win_k * Vsharek^2: squared vote
# shares of candidates flagged both as NFP and as winners. Terms that are NA
# are dropped, so a row with no usable term gets 0. Computed for all
# municipality-years at once.
vs_conc_wnfp <- local({
  slots <- 1:6
  is_nfp <- as.matrix(elec.dat[paste0("NFP_", slots)])
  won <- as.matrix(elec.dat[paste0("win_", slots)])
  shares <- as.matrix(elec.dat[paste0("Vshare", slots)])
  unname(rowSums(is_nfp * won * shares^2, na.rm = TRUE))
})
length(vs_conc_wnfp)
elec.dat$vs_conc_wnfp <- vs_conc_wnfp
# Prints NA if any value is missing.
range(elec.dat$vs_conc_wnfp)

# NB: when there are no NFP winners in a district, municipalities have vs_conc_wnfp == 0
check <- subset(elec.dat, elec.dat$year > 1995 & elec.dat$cand_02_pty == "NFP")
sum(check$vs_conc_wnfp)
# 0

# Should get 0 for all elections before 1995
check <- (subset(elec.dat, elec.dat$year < 1995))
table(check$vs_conc_wnfp)




# ---------------------------------
# make an index capturing the degree to which a municipality concentrates its votes on
# a candidate of the largest opposition party (1980-1993 JSP; 1996 NFP; 2000-2014 DPJ)

# Pick the index by election year: the JSP index before 1995, the NFP index in
# 1996, and the DPJ index for every other year from 1995 on.
elec.dat$vs_conc_wopp <- ifelse(
  elec.dat$year >= 1995,
  ifelse(elec.dat$year != 1996, elec.dat$vs_conc_wdpj, elec.dat$vs_conc_wnfp),
  elec.dat$vs_conc_wjsp
)

# Eyeball the first 50 rows.
elec.dat[
  1:50,
  c("year", "cand_01_pty", "vs_conc_wldp", "vs_conc_wjsp", "vs_conc_wnfp",
    "vs_conc_wdpj", "vs_conc_wopp")
]


# -------------------------------
# votes for a single LDP winner/votes cast

# In each municipality-year, identify the largest number of votes that municipality
# supplied to an LDP winner.  Get the LDP winners in the district and the votes
# they received in the municipality, and record the maximum of this.

# For every municipality-year: the largest cand_0k_votes among candidate slots
# 1-6 flagged as both LDP (LDP_k == 1) and winner (win_k == 1); 0 when no slot
# qualifies; NA when a qualifying slot has missing votes. Computed across the
# six slots for all rows at once instead of row by row.
votes_single_LDPw <- local({
  n_rows <- nrow(elec.dat)
  best <- rep(-Inf, n_rows)
  n_ldp_winners <- integer(n_rows)
  for (k in 1:6) {
    is_ldp_winner <- elec.dat[[paste0("LDP_", k)]] == 1 &
      elec.dat[[paste0("win_", k)]] == 1
    # The flags must be decidable for every row.
    if (length(is_ldp_winner) != n_rows || anyNA(is_ldp_winner)) {
      stop("cannot determine LDP-winner status for candidate slot ", k,
           call. = FALSE)
    }
    cand_votes <- elec.dat[[sprintf("cand_%02d_votes", k)]]
    # pmax() without na.rm keeps NA, matching max() over the selected votes.
    best[is_ldp_winner] <- pmax(best[is_ldp_winner], cand_votes[is_ldp_winner])
    n_ldp_winners <- n_ldp_winners + is_ldp_winner
  }
  # Consistency check: there cannot be more LDP winners than seats. Reported
  # as an explicit error rather than by silently abandoning the computation.
  seats <- elec.dat[["totseat_in_electoral_district"]]
  if (length(seats) != n_rows || !isTRUE(all(n_ldp_winners <= seats))) {
    stop("number of LDP winners exceeds (or cannot be compared with) ",
         "totseat_in_electoral_district", call. = FALSE)
  }
  best[n_ldp_winners == 0L] <- 0
  best
})
length(votes_single_LDPw)
elec.dat$votes_single_LDPw <- votes_single_LDPw
# Prints NA if any value is missing.
range(elec.dat$votes_single_LDPw)

# check:
# After 1996, when cand_01_pty is not LDP, votes_single_LDPw should be zero
check <- subset(elec.dat, elec.dat$cand_01_pty != "LDP" & elec.dat$year > 1995)
table(check$votes_single_LDPw)
# all zeros

elec.dat[1:50, c("votes_single_LDPw", "mun_voted")]

# Calculate votes cast for a single LDP winner/votes cast
elec.dat$vs_single_LDPw <- elec.dat$votes_single_LDPw / elec.dat$mun_voted
range(elec.dat$vs_single_LDPw)









# -------------------------------
# votes for all LDP winners/votes cast

# In each municipality-year, identify the number of votes that municipality
# supplied to all LDP winners and sum them.

# For every municipality-year: the sum of cand_0k_votes over candidate slots
# 1-6 flagged as both LDP (LDP_k == 1) and winner (win_k == 1); 0 when no slot
# qualifies; NA when a qualifying slot has missing votes. Computed across the
# six slots for all rows at once instead of row by row.
votes_all_LDPw <- local({
  n_rows <- nrow(elec.dat)
  total <- numeric(n_rows)
  n_ldp_winners <- integer(n_rows)
  for (k in 1:6) {
    is_ldp_winner <- elec.dat[[paste0("LDP_", k)]] == 1 &
      elec.dat[[paste0("win_", k)]] == 1
    # The flags must be decidable for every row.
    if (length(is_ldp_winner) != n_rows || anyNA(is_ldp_winner)) {
      stop("cannot determine LDP-winner status for candidate slot ", k,
           call. = FALSE)
    }
    cand_votes <- elec.dat[[sprintf("cand_%02d_votes", k)]]
    # Plain addition keeps NA, matching sum() over the selected votes.
    total[is_ldp_winner] <- total[is_ldp_winner] + cand_votes[is_ldp_winner]
    n_ldp_winners <- n_ldp_winners + is_ldp_winner
  }
  # Consistency check: there cannot be more LDP winners than seats. Reported
  # as an explicit error rather than by silently abandoning the computation.
  seats <- elec.dat[["totseat_in_electoral_district"]]
  if (length(seats) != n_rows || !isTRUE(all(n_ldp_winners <= seats))) {
    stop("number of LDP winners exceeds (or cannot be compared with) ",
         "totseat_in_electoral_district", call. = FALSE)
  }
  total
})
length(votes_all_LDPw)
elec.dat$votes_all_LDPw <- votes_all_LDPw
# Prints NA if any value is missing.
range(elec.dat$votes_all_LDPw)

# check:
# After 1996, when cand_01_pty is not LDP, votes_all_LDPw should be zero
check <- subset(elec.dat, elec.dat$cand_01_pty != "LDP" & elec.dat$year > 1995)
table(check$votes_all_LDPw)
# all zeros

elec.dat[1:50, c("votes_all_LDPw", "mun_voted")]

# Calculate votes cast for all LDP winners/votes cast
elec.dat$vs_all_LDPw <- elec.dat$votes_all_LDPw / elec.dat$mun_voted
range(elec.dat$vs_all_LDPw)







# -------------------------------
# votes for all LDP candidates/votes cast

# LDP candidates go up to 9th candidate
# sum(na.omit(elec.dat$cand_09_pty=="LDP"))

# For every municipality-year: the sum of cand_0k_votes over candidate slots
# 1-9 flagged as LDP (LDP_k == 1), winners or not; 0 when no slot qualifies;
# NA when a qualifying slot has missing votes. Computed across the nine slots
# for all rows at once instead of row by row.
votes_all_LDPc <- local({
  n_rows <- nrow(elec.dat)
  total <- numeric(n_rows)
  for (k in 1:9) {
    is_ldp <- elec.dat[[paste0("LDP_", k)]] == 1
    # The flag must be present and non-missing for every row.
    if (length(is_ldp) != n_rows || anyNA(is_ldp)) {
      stop("LDP_", k, " is absent or has missing values", call. = FALSE)
    }
    cand_votes <- elec.dat[[sprintf("cand_%02d_votes", k)]]
    # Plain addition keeps NA, matching sum() over the selected votes.
    total[is_ldp] <- total[is_ldp] + cand_votes[is_ldp]
  }
  total
})

length(votes_all_LDPc)
elec.dat$votes_all_LDPc <- votes_all_LDPc
# Prints NA if any value is missing.
range(elec.dat$votes_all_LDPc)

# Inspect the parties of the first five candidates where the total is 0.
check <- subset(elec.dat, elec.dat$votes_all_LDPc == 0)
#check2 <- subset(check, check$year>1993)
check[, c("cand_01_pty", "cand_02_pty", "cand_03_pty", "cand_04_pty", "cand_05_pty")]

elec.dat[1:50, c("votes_all_LDPc", "mun_voted")]

# Calculate votes cast for all LDP cands/votes cast
elec.dat$vs_all_LDPc <- elec.dat$votes_all_LDPc / elec.dat$mun_voted
range(elec.dat$vs_all_LDPc)





# -------------------------------
# votes for a single LDP candidate/votes cast

# LDP candidates go up to 9th candidate
# sum(na.omit(elec.dat$cand_09_pty=="LDP"))

# For every row of elec.dat, take the largest vote count among the candidate
# slots (1-9) whose LDP_k indicator equals 1; rows with no flagged slot get 0.
# This is done on whole columns: the earlier row-by-row loop extracted one
# data-frame row per iteration and grew the result with c(), which is
# quadratic in the number of rows.
ldp_flag_cols <- paste0("LDP_", 1:9)
ldp_vote_cols <- sprintf("cand_%02d_votes", 1:9)

ldp_flag <- as.matrix(elec.dat[, ldp_flag_cols]) == 1
# A missing indicator stopped the loop version (NA in `if`); keep failing
# loudly rather than silently treating NA as "not LDP".
stopifnot(!anyNA(ldp_flag))

ldp_votes <- as.matrix(elec.dat[, ldp_vote_cols])
# -Inf marks "not an LDP slot" so it can never win the maximum. An NA vote
# count in a flagged slot is left in place and makes the row maximum NA,
# exactly as max() did in the loop.
ldp_votes[!ldp_flag] <- -Inf

votes_single_LDPc <- rep(-Inf, nrow(elec.dat))
for (k in seq_along(ldp_vote_cols)) {
  votes_single_LDPc <- pmax(votes_single_LDPc, unname(ldp_votes[, k]))
}
# Rows where no slot was flagged are still at -Inf: record 0 votes for them.
votes_single_LDPc[which(votes_single_LDPc == -Inf)] <- 0

length(votes_single_LDPc)
elec.dat$votes_single_LDPc <- votes_single_LDPc
range(elec.dat$votes_single_LDPc)

# Inspect the rows in which no LDP candidate received votes:
check <- subset(elec.dat, elec.dat$votes_single_LDPc == 0)
check[, c("cand_01_pty", "cand_02_pty", "cand_03_pty", "cand_04_pty", "cand_05_pty")]

elec.dat[1:50, c("votes_single_LDPc", "mun_voted")]

# Calculate votes cast for single LDP cands/votes cast
elec.dat$vs_single_LDPc <- elec.dat$votes_single_LDPc / elec.dat$mun_voted
range(elec.dat$vs_single_LDPc)










# -------------------------------------
# Next, lets create lagged vs_conc_wldp, rel.rank.vs_conc_wldp, vs_single_LDPw,
# LDP_Vshare_c, and LDP_Vshare_w

# Reduce dataset to id, year, muncode_num and the variables to be lagged.
lag_sources <- c("vs_conc_wldp", "rel.rank.vs_conc_wldp", "vs_single_LDPw",
                 "LDP_Vshare_c", "LDP_Vshare_w")
myvars <- c("id", "year", "muncode_num", lag_sources)
dat2 <- elec.dat[myvars]

# Restrict dataset to municipality-years with NEEDs data (because we need demographic and fiscal vars):
dat3 <- dat2[!is.na(dat2$muncode_num), ]

# Order by muncode_num first, and then year. Columns are referred to by name
# so the ordering does not depend on their position in myvars.
dat4 <- dat3[order(dat3$muncode_num, dat3$year), ]
dat4[1:10, ]

# Lets make sure there is only one muncode_num per year
sum(duplicated(paste(dat4$muncode_num, dat4$year, sep = "_"))) # 0 duplicated observations

# Create lagged support variables:
dat4$time <- dat4$year
library(data.table)
dat5 <- as.data.table(dat4)

# Each new column is the value from the previous row of the same
# municipality (rows are already sorted by year within muncode_num).
# The lag is requested explicitly as one period: the earlier calls passed
# n = 1:2, which makes shift() return two vectors for a single target column.
lag_targets <- paste0("L", lag_sources)
dat5[, (lag_targets) := lapply(.SD, shift, n = 1L, type = "lag"),
     by = muncode_num, .SDcols = lag_sources]
# Can check here:
dat5[1:50, ]
dat5[1:50, c("year", "vs_single_LDPw", "Lvs_single_LDPw")]

# We made these variables on a subset of data. Lets merge this back into elec.dat:
names(dat5)

# Restrict to id plus new vars (selected by name, not by column position):
dat6 <- dat5[, c("id", lag_targets), with = FALSE]

# Lets attach these new variables onto elec.dat; all = TRUE keeps the rows
# that were dropped above for lacking a muncode_num.
dta <- merge(elec.dat, dat6, by = "id", all = TRUE)

elec.dat <- dta











# ------------------------------------
#
# Create margin variables (marginvs and marginvp)
#
# -----------------------------------------------

# Lets use "marginvs" (votes received by cand in Mth position - votes received by candidate in M+1 position, 
# divided by "voted_in_electoral_district") 

# NB: the numerator is made by taking the number of votes the candidates in the Mth and M+1th positions
# in the district received in the district.  This was calculated with the JED-M data, and summed all the cand_01_votes
# each candidate got in the municipalities that make up the district.  The denominator ("voted_in_electoral_district") 
# is "ku_totvote" from Reed, which was calculated by summing the "ku_vote" of all candidates who ran.  Thus,
# both numerator and denominator include votes cast in split municipalities.  Thus, margin vars were constructed
# with split munis in them, but vote concentration variables excluded them.  This is appropriate because margin
# variables are district-specific. Split municipalities are not competitors in the tournament.

# Also construct "marginvp" (votes received by cand in Mth position - votes received by candidate in M+1 position, 
# divided by "voting_pop_in_electoral_district" (denominator comes from RS's "ku_electorate", so also includes
# split municipalities).

# working with elec.dat:

# For every district-year, compute the district-wide vote gap between the
# candidate in the Mth position and the candidate in the (M+1)th position,
# where M is totseat_in_electoral_district, then merge that gap back onto
# every row of the district-year and scale it into margin / closeness scores.

years <- unique(elec.dat$year)
cvotes.vector <- c("cand_01_votes_district", "cand_02_votes_district", "cand_03_votes_district", "cand_04_votes_district",
                   "cand_05_votes_district", "cand_06_votes_district", "cand_07_votes_district")

# One small data frame per district-year is collected in a list and bound
# once at the end. The earlier version called rbind() on every iteration and
# passed the numbers through a character matrix before converting them back.
margins_by_year <- vector("list", length(years))

# for each year:
for (j in seq_along(years)) {

  # Give me all data for that year:
  year.dset <- elec.dat[elec.dat$year == years[j], ]

  # Give me all the districts for that year:
  kus <- unique(year.dset$hor_electoral_district)

  # for each ku in that year
  margins_by_year[[j]] <- lapply(seq_along(kus), function(i) {

    # get that ku:
    subku <- year.dset[year.dset$hor_electoral_district == kus[i], ]

    # get M in that ku:
    m <- unique(subku$totseat_in_electoral_district)

    # District-level votes of the Mth and (M+1)th candidates. [[ ]] returns a
    # plain vector whether elec.dat is a data.frame or a tibble.
    mthvotes <- unique(subku[[cvotes.vector[m]]])
    mplusvotes <- unique(subku[[cvotes.vector[m + 1]]])

    data.frame(year = unique(subku$year),
               hor_electoral_district = unique(subku$hor_electoral_district),
               difference_M_and_Mplus = mthvotes - mplusvotes)
  })
}

all.the.years <- do.call(rbind, unlist(margins_by_year, recursive = FALSE))
all.the.years$difference_M_and_Mplus <- as.numeric(all.the.years$difference_M_and_Mplus)
all.the.years$district_year <- paste(all.the.years$year, all.the.years$hor_electoral_district, sep = "_")

myvars <- c("difference_M_and_Mplus", "district_year")
tomerge <- all.the.years[myvars]

# check:
e <- tomerge$district_year %in% elec.dat$district_year
table(e) # all true

# Merge this back into elec.dat with year and district:
elec.dat1 <- merge(elec.dat, tomerge, by = "district_year")

# Make "marginvs":
elec.dat1$marginvs <- elec.dat1$difference_M_and_Mplus / elec.dat1$voted_in_electoral_district

# Make "marginvp":
elec.dat1$marginvp <- elec.dat1$difference_M_and_Mplus / elec.dat1$voting_pop_in_electoral_district

# Increases in margin indicate decreases in closeness, so lets multiply by -1 to make closeness
# Higher scores indicate greater closeness

elec.dat1$closenessvp <- elec.dat1$marginvp * -1

elec.dat1$closenessvs <- elec.dat1$marginvs * -1

elec.dat <- elec.dat1


# ---------------------------------------------
# Write the election-level data (with all variables built so far) to disk

saveRDS(elec.dat, file = "Book_Mun_Level_Turnout.rds")





# -----------------------------------------------------
# Carry the newly built variables over to the full data set (dat) and save it

# Columns to carry over: the row key "id" plus every variable created above.
newvars <- c(
  "id", "district_year", "district_reform", "Lmun_turnout",
  "vs_conc_c", "vs_conc_w", "vs_conc_wldp", "rel.rank.vs_conc_wldp",
  "LDP_Vshare_c", "LDP_Vshare_w", "vs_conc_wjsp", "vs_conc_wdpj",
  "vs_conc_wnfp", "vs_conc_wopp", "votes_single_LDPw", "vs_single_LDPw",
  "votes_all_LDPw", "vs_all_LDPw", "votes_all_LDPc", "vs_all_LDPc",
  "votes_single_LDPc", "vs_single_LDPc", "Lvs_conc_wldp",
  "Lrel.rank.vs_conc_wldp", "Lvs_single_LDPw", "LLDP_Vshare_c",
  "LLDP_Vshare_w", "difference_M_and_Mplus", "marginvs",
  "marginvp", "closenessvp", "closenessvs"
)

elec.dat2 <- elec.dat[, newvars, drop = FALSE]

# Full outer join on id, so rows of dat without election data are kept.
dat1 <- merge(dat, elec.dat2, by = "id", all = TRUE)

saveRDS(dat1, file = "Master_plus_Snow_Turn.rds")











# ---------------------------------------
# PART 2. 
# VARIABLES NEEDED FOR MUN-LEVEL TRANSFERS ANALYSES

# ---------------------------------------

dat <- readRDS("Master_plus_Snow_Turn.rds")




# ----------------------------------------------
# Let us remake all the lag and lead variables using muncode_num (not code)
# ----------------------------------------------

# (We make Lngaid_pc below).

# Llogngaid_pc
# F1logngaid_pc
# F1ngaid_pc
# ~ .... lagged and lead financial variables need to be remade:

# Drop the existing versions so the merge at the end of this section does
# not produce duplicated (.x / .y) columns.
dat$Llogngaid_pc <- NULL
dat$F1logngaid_pc <- NULL
dat$F1ngaid_pc <- NULL

# Reduce dataset to id, year, muncode_num, ngaid_pc, and logngaid_pc,
myvars <- c("id", "year", "muncode_num", "ngaid_pc", "logngaid_pc")
dat2 <- dat[myvars]

# Restrict dataset to municipality-years with NEEDs data:
dat3 <- dat2[!is.na(dat2$muncode_num), ]

# Further restrict to municipality-years where ngaid_pc is populated:
dat4 <- dat3[!is.na(dat3$ngaid_pc), ]

# They all have a muncode_num, so lets order by muncode_num first, and then
# year. Columns are referred to by name rather than by position.
dat5 <- dat4[order(dat4$muncode_num, dat4$year), ]
dat5[1:100, ]

# Lets make sure there is only one muncode_num per year
sum(duplicated(paste(dat5$muncode_num, dat5$year, sep = "_"))) # 0 duplicated observations

dat5$time <- dat5$year
dat5[1:100, ]

library(data.table)
dat6 <- as.data.table(dat5)

# Lead / lag by one row within each municipality (rows are sorted by year).
# The shift is requested explicitly as one period: the earlier calls passed
# n = 1:2, which makes shift() return two vectors for a single target column.
# NOTE(review): because rows with missing ngaid_pc were dropped above, "next"
# and "previous" mean the adjacent populated row, which is the adjacent
# fiscal year only if there are no gaps - confirm.

# Create a lead ngaid_pc variable (a municipality-year's ngaid_pc score in the subsequent fiscal year):
dat6[, F1ngaid_pc := shift(ngaid_pc, n = 1L, type = "lead"), by = muncode_num]
# Can check here:
# (NB: we don't have subsidy data in 2016, so municipality-years in 2015 will have NA for F1ngaid_pc)
dat6[1:100, ]

# Create lead logngaid_pc variable:
dat6[, F1logngaid_pc := shift(logngaid_pc, n = 1L, type = "lead"), by = muncode_num]
# Can check here:
dat6[1:50, ]

# Create lagged logngaid_pc variable (Llogngaid_pc):
dat6[, Llogngaid_pc := shift(logngaid_pc, n = 1L, type = "lag"), by = muncode_num]
# Can check here:
dat6[1:50, ]



# We made these variables on a subset of data. Lets merge this back into dat:

names(dat6)

# Restrict dat6 to "id" plus the three new variables (selected by name, not
# by column position):
smalldat <- dat6[, c("id", "F1ngaid_pc", "F1logngaid_pc", "Llogngaid_pc"), with = FALSE]

# Lets attach these new variables onto dat; all = TRUE keeps the rows that
# were filtered out above.
dta <- merge(dat, smalldat, by = "id", all = TRUE)

dat <- dta




library(dplyr)
library(readstata13)
library(stargazer)
library(lmtest)
library(plm)
library(clubSandwich)
library(apsrtable)

# -----------------------------------------------
# There is further redistricting under MMM that was not
# taken into account with hor_electoral_district and district_reform.
# This section attaches kucoder from the Reed-Smith file to the election rows,
# matching on district_year.

# ----------------------------------
# Subset to electoral data (later, we can merge this back into dat):

elec.dat <- dat[!is.na(dat$hor_electoral_district),]

# ----------------------------------
# Lets read in Reed-Smith data and attach kucoder onto elec.dat:

dta <- read.csv("Reed-Smith-JHRED.csv", fileEncoding = "SHIFT-JIS")
# Keep rows after 1979 that are not by-elections and have a non-zero kucode:
dta2 <- subset(dta, dta$year>1979 & dta$byelection==0 & dta$kucode!=0)
table(dta2$kucode)
# Reduce to the distinct (year, kucode, kucoder) combinations:
dta3 <- dta2[, c("year", "kucode", "kucoder")]
dta4 <- dta3[!duplicated(dta3),]
table(dta4$kucoder)

# Let us recode year==1990 as year==1989
# NOTE(review): presumably elec.dat records this election under 1989 - confirm.
table(dta4$year)
dta4$year.new <- ifelse(dta4$year==1990, 1989, dta4$year)
table(dta4$year)
table(dta4$year.new)

# Create a district_year var to match the one in elec.dat
dta4$district_year <- paste(dta4$year.new, dta4$kucode, sep="_")

# Pare the dataset down to what I need:
dta5 <- dta4[, c("district_year", "kucoder")]

# Lets make sure there are no duplicated district_years, which happen when not all candidates in a district-year
# receive the same value for kucoder (due to an error in Reed-Smith):
check <- dta5[duplicated(dta5$district_year),] # there are 5 district_years:
# Lets take a look at these (every row belonging to a duplicated district_year):
check2 <- dta5[dta5$district_year %in% check$district_year,]
nrow(check2)
# I consulted with Reed Smith, and all the kucoder scores in this data are supposed to be the .1s, not the 0s. 
# So let us get those without 1s, and delete them from dta5
# NOTE(review): the rows to drop are picked by position (2, 4, 6, 8, 10). This
# assumes check2 has exactly ten rows and that the unwanted kucoder sits in
# the even-numbered rows - re-verify whenever the input file changes.
check3 <- check2[c(2, 4, 6, 8, 10),]
# Key that identifies one (district_year, kucoder) pair:
check3$toelim <- paste(check3$district_year, check3$kucoder, sep="_")

# Lets eliminate these from dta5:
dta5$toelim <- paste(dta5$district_year, dta5$kucoder, sep="_")
nrow(dta5)
dta6 <- dta5[!(dta5$toelim %in% check3$toelim),]
nrow(dta6) # yes, eliminated all five
dta6$toelim <- NULL

# Which district_years in Reed Smith didn't make it into elec.dat?
check <- dta6[!dta6$district_year %in% elec.dat$district_year,]
length(unique(check$district_year))
# 107 district_years
# These are comprised solely of municipalities that are split across districts, such as Chiba 6 in 2003
# (hor_electoral_district=1206).  There are only four districts excluded in the pre-ER period,
# and these are the Amami Islands (1980_4604).

# Which district_years in elec.dat are not in Reed Smith?
check2 <- elec.dat[!elec.dat$district_year %in% dta6$district_year,]
nrow(check2)
# 0 ( they're all in)

# lets merge dta6 onto elec.dat (inner join on district_year):
dta7 <- merge(elec.dat, dta6, by="district_year")

elec.dat <- dta7






# -----------------------------------------------------
# "district_year" already serves as the district-year FE.
# Build a *second* district_reform FE, this time from kucoder:

# The identifier is the reform indicator joined to kucoder with "_", so it is
# reform-specific (all codes change with reform). It cannot be relied on after
# reform because it does not take into account changes in the municipalities
# that comprise the district.

elec.dat[["district_reform2"]] <- paste0(elec.dat[["reform"]], "_", elec.dat[["kucoder"]])
head(elec.dat[["district_reform2"]])
class(elec.dat[["district_reform2"]])













# -----------------------------------------------------
# CREATING A NEW VARIABLE THAT INCORPORATES KOMEITO SSD WINNERS

# In the post-reform period, the LDP didn't run in some districts and allowed
# Komeito candidates to run.

# -----------------------------------------
# From the 2000 election, they were coordinating.  Collect the districts whose
# first-placed candidate is Komeito (cand_01_pty==Komeito) *and* in which the
# LDP received no votes (VotesLDP==0):

# Post-electoral-reform rows that have a muncode_num:
is_post_reform <- elec.dat$year >= 1996 & !is.na(elec.dat$muncode_num)
post.red <- elec.dat[which(is_post_reform), ]

# Districts where no LDP candidate ran and the Komeito candidate won:
kom_without_ldp <- post.red$cand_01_pty == "Komeito" & post.red$VotesLDP == 0
Komwins <- post.red[which(kom_without_ldp), ]
d_years <- unique(Komwins$district_year)

#sum(post.red$sumLDP_VshareVP==0)
# 5730 municipalities have no LDP winner
#sum(!(post.red$cand_01_pty=="LDP"))
# 5730 municipalities do not have an LDP candidate in first place

# sumgov_VshareVP records the electoral support the municipality gave to the
# coalition's winner: VshareVP1 in the Komeito district-years collected
# above, and sumLDP_VshareVP everywhere else.

in_kom_district <- post.red$district_year %in% d_years
post.red$sumgov_VshareVP <- NULL
post.red$sumgov_VshareVP <- ifelse(in_kom_district,
                                   post.red$VshareVP1,
                                   post.red$sumLDP_VshareVP)

#check <- subset(post.red, post.red$cand_01_pty=="Komeito")
#check[, c("cand_01_votes", "cand_01_pty", 
#          "mun_voting_pop", "VshareVP1", "sumLDP_VshareVP", "sumgov_VshareVP")]

#check <- subset(post.red, post.red$cand_01_pty=="DPJ")
#check[1:100, c("cand_01_votes", "cand_01_pty", 
#          "mun_voting_pop", "VshareVP1", "sumLDP_VshareVP", "sumgov_VshareVP")]

# Missing-value counts for both variables:
sum(is.na(post.red[["sumLDP_VshareVP"]])) # no NAs because if no LDP winner, gets 0.
sum(is.na(post.red[["sumgov_VshareVP"]])) # no NAs because if no LDP winner or Kom winner, gets 0

# Zero counts for both variables:
sum(post.red[["sumLDP_VshareVP"]] == 0) # 5730 observations with no LDP winner
sum(post.red[["sumgov_VshareVP"]] == 0) # 5546 observations with no LDP winner or Kom winner



# ----------------------------------------------------
# Now, let us create a ranked version of this variable:

post.red <- transform(post.red, rsumgov_VshareVP = ave(sumgov_VshareVP, district_year,
                                                       FUN = function(x)
                                                         rank(x, ties.method = "average")))
# Within a district-year, municipalities with higher sumgov_VshareVP get higher ranks
# e.g. municipality with the highest sumgov_VshareVP in a district-year with 22 municipalities
# is rank==22.

check <- subset(post.red, post.red$cand_01_pty == "DPJ")
check[1:100, c("cand_01_votes", "cand_01_pty",
               "mun_voting_pop", "VshareVP1", "sumLDP_VshareVP", "sumgov_VshareVP", "rsumgov_VshareVP")]
# Shows that when sumgov_VshareVP == 0, rsumgov_VshareVP receives a ranking.

check <- subset(post.red, post.red$num == 1)
check[1:100, c("cand_01_votes", "cand_01_pty",
               "mun_voting_pop", "VshareVP1", "sumLDP_VshareVP", "sumgov_VshareVP", "rsumgov_VshareVP")]
# When there is only one muni in a district, rsumgov_VshareVP receives a ranking (1).

# Fix this: when sumgov_VshareVP == 0 for the whole district-year,
# rsumgov_VshareVP should be NA; when num==1, rsumgov_VshareVP should be NA.

# Get all the district_years where every municipality has sumgov_VshareVP==0.
# One grouped pass replaces the earlier loop, which re-subset post.red once
# per district_year and grew all_same with c(). all_same is only used for
# membership tests below, so the order of its elements does not matter.
year_dists <- unique(post.red$district_year)
all_zero_by_dy <- tapply(post.red$sumgov_VshareVP == 0, post.red$district_year, all)
all_same <- names(all_zero_by_dy)[which(all_zero_by_dy)]

# For district_years in all_same (no LDP or Kom winner, so sumgov_VshareVP==0 for all),
# create new.rsumgov_VshareVP, which is NA (otherwise, it is rsumgov_VshareVP):

post.red$new.rsumgov_VshareVP <- NULL
post.red$new.rsumgov_VshareVP <- ifelse(post.red$district_year %in% all_same,
                                        NA,
                                        post.red$rsumgov_VshareVP)
# Check:
#check <- post.red[is.na(post.red$new.rsumgov_VshareVP),]
#sum(check$sumgov_VshareVP) # 0

#check <- subset(post.red, post.red$year==1996 & post.red$hor_electoral_district==101)
#check[, c("muncode_num", "district_year", "sumgov_VshareVP", "new.rsumgov_VshareVP")]

#check <- subset(post.red, post.red$VotesLDP==0 & !post.red$cand_01_pty=="Komeito")
#nrow(check)
#sum(is.na(check$new.rsumgov_VshareVP))



# -----------------------------------------
# Lets make sure new.rsumgov_VshareVP is also NA for observations in districts where num==1

check <- subset(post.red, post.red$num == 1)
check$new.rsumgov_VshareVP # some have 1, all should have NA
nrow(check)
# 162 altogether
sum(is.na(check$new.rsumgov_VshareVP))
# 59 have NA (all need to have NA)

post.red$new.new.rsumgov_VshareVP <- NULL
post.red$new.new.rsumgov_VshareVP <- ifelse(post.red$num == 1, NA, post.red$new.rsumgov_VshareVP)

check <- subset(post.red, post.red$num == 1)
sum(is.na(check$new.new.rsumgov_VshareVP))
# 162

# rsumgov_VshareVP has no NAs:
sum(is.na(post.red$rsumgov_VshareVP))
# 0

# new.rsumgov_VshareVP has more NAs:
sum(is.na(post.red$new.rsumgov_VshareVP))
# 5546

# new.new.rsumgov_VshareVP has even more NAs:
sum(is.na(post.red$new.new.rsumgov_VshareVP))
# 5649




# -----------------------------------------
# new.new.rsumgov_VshareVP contains absolute rankings, with NAs for observations
# in districts without LDP winners and with a single muni per district.

# Let us standardize this across districts so that 1 is the municipality ranking highest on this
# in a district and 0 is the lowest-ranked municipality in the district.

# Formula: (absolute rank - 1) / (total number of munis - 1).
# It is applied to whole columns at once. Wherever the absolute rank is NA
# the arithmetic yields NA as well, which is what the earlier row-by-row loop
# assigned explicitly (that loop extracted one row per iteration and grew the
# result with c()).

rel.rank.rsumgov_VshareVP <- (post.red$new.new.rsumgov_VshareVP - 1) / (post.red$num - 1)
length(rel.rank.rsumgov_VshareVP) == nrow(post.red)
post.red$rel.rank.rsumgov_VshareVP <- rel.rank.rsumgov_VshareVP

# Check (LDP district winner)
#check <- subset(post.red, post.red$year==1996 & post.red$hor_electoral_district==1001)
#check[, c("muncode_num", "sumLDP_VshareVP", "rsumLDP_VshareVP", 
#          "sumgov_VshareVP", "new.new.rsumgov_VshareVP", "rel.rank.rsumgov_VshareVP")]

# Check (Kom district winner)
check <- subset(post.red, post.red$year == 2000 & post.red$hor_electoral_district == 2703)
check[, c("muncode_num", "sumLDP_VshareVP", "rsumLDP_VshareVP",
          "sumgov_VshareVP", "new.new.rsumgov_VshareVP", "rel.rank.rsumgov_VshareVP")]

# Check (DPJ district winner)
check <- subset(post.red, post.red$year == 1996 & post.red$hor_electoral_district == 101)
check[, c("muncode_num", "sumLDP_VshareVP", "rsumLDP_VshareVP",
          "sumgov_VshareVP", "new.new.rsumgov_VshareVP", "rel.rank.rsumgov_VshareVP")]

# lets keep rel.rank.rsumgov_VshareVP but call it rsumgov_VshareVP (to match rsumLDP_VshareVP)
# (the existing absolute-rank column of that name is dropped first)
post.red$rsumgov_VshareVP <- NULL
post.red$rsumgov_VshareVP <- post.red$rel.rank.rsumgov_VshareVP
post.red$rel.rank.rsumgov_VshareVP <- NULL

# lets keep new.new.rsumgov_VshareVP but call it abs.rsumgov_VshareVP
post.red$abs.rsumgov_VshareVP <- post.red$new.new.rsumgov_VshareVP
post.red$new.new.rsumgov_VshareVP <- NULL

# final one to delete
post.red$new.rsumgov_VshareVP <- NULL

names(post.red)

# done! Stored in rsumgov_VshareVP and abs.rsumgov_VshareVP





# -------------------------------------------
# Flag the district-years stored in d_years (Komeito runs and the LDP
# candidate does not):

in_kom_ssd <- post.red$district_year %in% d_years
sum(in_kom_ssd)
# 184

# 1 for rows in those district-years, 0 for all other rows
post.red[["KomSSD"]] <- as.numeric(in_kom_ssd)

table(post.red[["KomSSD"]])
# 0     1 
# 16944   184

# -------------------------------------------
# Carry the new variables from post.red back onto elec.dat:

newvars <- c("id",
             "sumgov_VshareVP",
             "rsumgov_VshareVP",
             "abs.rsumgov_VshareVP",
             "KomSSD")

# id plus the new variables only:
tomerge <- post.red[, newvars, drop = FALSE]

# Full outer join on id, so election rows outside post.red are kept:
dta <- merge(elec.dat, tomerge, by = "id", all = TRUE)

elec.dat <- dta

# NB: sumLDP_VshareVP was made for all observations with elections data but sumgov_VshareVP
# was not, so there are more NAs in the latter than the former 
#check <- subset(elec.dat, elec.dat$year >=1996)
#sum(is.na(check$sumLDP_VshareVP))
#sum(is.na(check$sumgov_VshareVP))
# 713.... these observations don't have a muncode_num
#check2 <- subset(check, is.na(check$sumgov_VshareVP))
#nrow(check2)            
# 713









# -----------------------------------------------------
# ENCOMPASSING LDP LOSERS WHO ARE RESURRECTED IN PR INTO THIS

# Lets read in Reed-Smith data and get district_years that saw LDP candidates
# lose the SMD and enter via the PR tier:

# Read the file with the same encoding as the earlier read of this file in
# this script, so both reads decode it identically.
dta <- read.csv("Reed-Smith-JHRED.csv", fileEncoding = "SHIFT-JIS")
# Keep post-1993 general-election rows with a non-zero kucode where the
# candidate's party is LDP and result == 2.
# NOTE(review): result == 2 is taken to mean "lost the SMD, seated via PR",
# per the comment above - confirm against the Reed-Smith codebook.
dta2 <- subset(dta, dta$year > 1993 & dta$byelection == 0 &
                 dta$kucode != 0 & dta$party_en == "LDP" & dta$result == 2)
dta2$district_year <- paste(dta2$year, dta2$kucode, sep = "_")
resurrect <- dta2$district_year
# 261 district_years




# -------------------------------------------
# Lets extract these district_years from post.red, and just make sure they
# don't have cand_01_pty==LDP

# Rebuild post.red from elec.dat so it carries the variables merged in above.
post.red <- subset(elec.dat, elec.dat$year >= 1996 & !is.na(elec.dat$muncode_num))
check <- subset(post.red, post.red$district_year %in% resurrect)
length(unique(check$district_year))
# 243 (the remaining district_years either were split munis
# or didn't have NEEDs data)
table(check$cand_01_pty)
# in these, there are no LDP or Komeito candidates in first place





# -----------------------------------------
#  In these district_years, which place is the LDP candidate in?

resurrect2 <- subset(post.red, post.red$district_year %in% resurrect)
nrow(resurrect2)  # 1737 obs
table(resurrect2$cand_02_pty) # in 1586 observations, the LDP candidate is in second place
table(resurrect2$cand_03_pty) # in 151 obs, the LDP cand is in third place

# Get the district_years in which the LDP candidate is second:
second <- subset(resurrect2, resurrect2$cand_02_pty == "LDP")
nrow(second) # 1586
sd_years <- unique(second$district_year)

# In these district_years, LDP candidate is third
third <- subset(resurrect2, resurrect2$cand_03_pty == "LDP")
nrow(third) # 151
td_years <- unique(third$district_year)

# Every resurrect district_year should fall in exactly one of the two sets:
length(unique(resurrect2$district_year)) == length(sd_years) + length(td_years)


# Make a new variable called sumallLDP_VshareVP: if a municipality's district_year is in
# sd_years, assign this the vote share of the second-place getter (which is the LDP
# candidate), if its district_year is in td_years, assign it the vote share of the
# third-place getter (which is the LDP candidate).  If its district_year is in neither of these,
# assign it its sumLDP_VshareVP score (which is either 0 or populated).

sum(post.red$sumLDP_VshareVP == 0)
#5730

post.red$sumallLDP_VshareVP <- NULL
post.red$sumallLDP_VshareVP <- ifelse(post.red$district_year %in% sd_years, post.red$VshareVP2,
                                      ifelse(post.red$district_year %in% td_years, post.red$VshareVP3,
                                             post.red$sumLDP_VshareVP))
# Count the zeros in the new variable. The earlier line referred to
# post.red$sumall_VshareVP, a column that does not exist, so it summed an
# empty vector and always printed 0.
sum(post.red$sumallLDP_VshareVP == 0)
# 3993
# fewer 0s than sumLDP_VshareVP because some have been populated with vote shares of
# resurrected LDP candidates.

check <- subset(post.red, post.red$district_year %in% resurrect)

nrow(check) == sum(check$sumLDP_VshareVP == 0)
# TRUE (should be 0s, it is).

sum(check$sumallLDP_VshareVP == 0)
# there should be no 0s.  There are no 0s.



# ----------------------------------------------------
# Now, let us create a ranked version of this variable:

post.red <- transform(post.red, rsumallLDP_VshareVP = ave(sumallLDP_VshareVP, district_year,
                                                          FUN = function(x) 
                                                            rank(x, ties.method="average")))
# Within a district-year, municipalities with higher sumallLDP_VshareVP get higher ranks
# e.g. municipality with the highest sumallLDP_VshareVP in a district-year with 22 municipalities
# is rank==22.

resurrect[1]
check <- subset(post.red, post.red$district_year=="1996_102")
check[, c("cand_01_votes", "cand_01_pty", 
          "cand_02_pty", "VshareVP2", "sumLDP_VshareVP", "sumallLDP_VshareVP", "rsumallLDP_VshareVP")]
# Shows that when sumallLDP_VshareVP == 0, rsumallLDP_VshareVP receives a ranking.

check <- subset(post.red, post.red$district_year %in% resurrect & post.red$num=="1")
check[, c("district_year", "cand_01_pty", "cand_02_pty",
          "VshareVP2", "sumLDP_VshareVP", "sumallLDP_VshareVP", "rsumallLDP_VshareVP")]
# When there is only one muni in a district, rsumallLDP_VshareVP receives a ranking (1).

# Fix this (when sumallLDP_VshareVP == 0, rsumallLDP_VshareVP should be NA; when num==1,
# sumallLDP_VshareVP should be NA.

# Get all the district_years where every municipality has sumallLDP_VshareVP==0
# No LDP candidate, even s a PR resurrected candidate):
year_dists <- unique(post.red$district_year)
all_same <- c() 
for(j in 1:length(year_dists)){
  d <- post.red[post.red$district_year==year_dists[j],]
  if(all(d$sumallLDP_VshareVP==0)){
    all_same <- c(all_same, unique(d$district_year))
  }
}

# For district_years in all_same (no LDP winner, SMD or otherwise, so 
# sumallLDP_VshareVP==0 for all), create new.rsumallLDP_VshareVP, which is NA 
# (otherwise, its rsumallLDP_VshareVP):

post.red$new.rsumallLDP_VshareVP <- NULL
post.red$new.rsumallLDP_VshareVP <- ifelse(post.red$district_year %in% all_same,
                                           NA, 
                                           post.red$rsumallLDP_VshareVP)
# Check:
#check <- post.red[is.na(post.red$new.rsumallLDP_VshareVP),]
#sum(check$sumallLDP_VshareVP) # 0



# -----------------------------------------
# Lets make sure new.rsumallLDP_VshareVP is also NA for observations in districts where num==1

# Rows with num == 1: inspect how many already have NA in
# new.rsumallLDP_VshareVP.
check <- post.red[which(post.red[["num"]] == 1), ]
check[["new.rsumallLDP_VshareVP"]] # some have 1, all should have NA
nrow(check)
# 162 altogether
sum(is.na(check[["new.rsumallLDP_VshareVP"]]))
# 50 have NA (all need to have NA)

# new.new.rsumallLDP_VshareVP: NA wherever num == 1, otherwise the value of
# new.rsumallLDP_VshareVP.
post.red[["new.new.rsumallLDP_VshareVP"]] <- NULL
post.red[["new.new.rsumallLDP_VshareVP"]] <- with(
  post.red,
  ifelse(num == 1, NA, new.rsumallLDP_VshareVP)
)

check <- post.red[which(post.red[["num"]] == 1), ]
nrow(check)
sum(is.na(check[["new.new.rsumallLDP_VshareVP"]]))
# 162

# NA counts of the three versions, from the original to the most restricted:
sum(is.na(post.red[["rsumallLDP_VshareVP"]]))
# 0

sum(is.na(post.red[["new.rsumallLDP_VshareVP"]]))
# 3993

sum(is.na(post.red[["new.new.rsumallLDP_VshareVP"]]))
# 4105




# -----------------------------------------
# new.new.rsumallLDP_VshareVP contains absolute rankings, with NAs for observations
# in districts without LDP winners (resurrected or otherwise) and with a single 
# muni per district. 

# Let us standardize this across districts so that 1 is the municipality ranking highest on this 
# in a district and 0 is the lowest-ranked municipality in the district.

# Let us do this for every municipality-year in which new.new.rsumallLDP_VshareVP 
# is not NA.  We use this formula: (absolute rank-1)/(total number of munis-1) 

# For the municipality-years in which the ranked variable "(new.new.rsumallLDP_VshareVP ")
# *is* NA, the new variable can be assigned "NA". 

# Relative rank = (absolute rank - 1) / (number of municipalities - 1),
# computed for all rows at once.  Arithmetic on NA yields NA, so rows whose
# absolute rank is NA stay NA without an explicit branch.
rel.rank.rsumallLDP_VshareVP <-
  (post.red$new.new.rsumallLDP_VshareVP - 1) / (post.red$num - 1)
length(rel.rank.rsumallLDP_VshareVP) == nrow(post.red)
post.red$rel.rank.rsumallLDP_VshareVP <- rel.rank.rsumallLDP_VshareVP

# Spot checks on two district-years (the matching `resurrect` entries are
# printed first for reference).
resurrect[1]
check <- post.red[which(post.red[["district_year"]] == "1996_102"), ]
nrow(check)
check[, c("muncode_num", "cand_02_pty", "VshareVP2",
          "sumLDP_VshareVP", "rsumLDP_VshareVP", "sumallLDP_VshareVP",
          "new.new.rsumallLDP_VshareVP", "rel.rank.rsumallLDP_VshareVP")]

resurrect[10]
check <- post.red[which(post.red[["district_year"]] == "2005_110"), ]
nrow(check)
check[, c("muncode_num", "cand_03_pty", "VshareVP3",
          "sumLDP_VshareVP", "rsumLDP_VshareVP", "sumallLDP_VshareVP",
          "new.new.rsumallLDP_VshareVP", "rel.rank.rsumallLDP_VshareVP")]

# The relative rank takes over the name rsumallLDP_VshareVP
# (to match rsumLDP_VshareVP); its working copy is dropped.
post.red[["rsumallLDP_VshareVP"]] <- NULL
post.red[["rsumallLDP_VshareVP"]] <- post.red[["rel.rank.rsumallLDP_VshareVP"]]
post.red[["rel.rank.rsumallLDP_VshareVP"]] <- NULL

# The absolute rank is kept under the name abs.rsumallLDP_VshareVP.
post.red[["abs.rsumallLDP_VshareVP"]] <- post.red[["new.new.rsumallLDP_VshareVP"]]
post.red[["new.new.rsumallLDP_VshareVP"]] <- NULL

# Remove the last intermediate column.
post.red[["new.rsumallLDP_VshareVP"]] <- NULL

names(post.red)

# done! Stored in rsumallLDP_VshareVP and abs.rsumallLDP_VshareVP




# -------------------------------------------
# Attach a variable for districts in which LDP lost and was resurrected in PR:

# Number of rows whose district_year is listed in `resurrect`.
sum(post.red[["district_year"]] %in% resurrect)
# 1737

# resurrectSSD: 1 when the district_year is listed in `resurrect`, else 0.
post.red[["resurrectSSD"]] <- as.numeric(post.red[["district_year"]] %in% resurrect)

table(post.red[["resurrectSSD"]])
# 0     1 
# 15391  1737




# -------------------------------------------
# Merge the new columns back into elec.dat

# id plus the variables created above:
newvars <- c("id", "sumallLDP_VshareVP", "rsumallLDP_VshareVP",
             "abs.rsumallLDP_VshareVP", "resurrectSSD")
tomerge <- post.red[newvars]

# Full outer join on id, so elec.dat rows without a match are kept:
dta <- merge(elec.dat, tomerge, by = "id", all = TRUE)

elec.dat <- dta







# -------------------------------------------
# CALCULATING ABSOLUTE RANKINGS FOR rsumLDP_VshareVP

# ------------------------------------------------
# To identify municipalities at the highest and second-highest rank in each district-year, 
# begin by making separate data sets for each district-year:

# One data frame per district_year value.
split_yeardist <- split(elec.dat, f = elec.dat[["district_year"]])

# Rank the rows of one district-year data frame on column `var`, reversed so
# that the largest value receives rank 1 and the smallest value receives the
# largest rank (high scores indicate LOWER ranks, less electoral support).
#
# Args:
#   x:        data frame containing "year", "muncode_num", "id" and `var`.
#   var:      name (string) of the column to rank.
#   out_name: name of the rank column in the result.  Defaults to the name that
#             used to be hard-coded, so existing calls are unaffected; pass a
#             different name when ranking another variable.
# Returns:
#   A data frame with year, muncode_num, id and the reversed rank.
# NOTE(review): missing values in `var` are left to rank()'s defaults; confirm
# how they should propagate before relying on groups that contain NA.
reverse_rank <- function(x, var, out_name = "abs.rsumLDP_VshareVP") {
  # [[ ]] always extracts a plain vector, whatever data-frame flavour x is.
  rk <- rank(x[[var]])
  out <- data.frame(x[, c("year", "muncode_num", "id"), drop = FALSE],
                    check.names = FALSE)
  out[[out_name]] <- max(rk) - rk + 1
  out
}

# Apply reverse_rank to "rsumLDP_VshareVP" in every district-year:
split_yeardist.rank <- lapply(split_yeardist, reverse_rank, "rsumLDP_VshareVP")
split_yeardist.rank[[1]]

# Stack the per-district results in a single rbind() call instead of growing a
# data frame inside a loop.  unname() keeps the list names out of the row names.
merge_rr <- do.call(rbind, unname(split_yeardist.rank))

# Keep only the merge key and the new rank column (selected by name rather
# than by dropping the first two positions):
tomerge <- merge_rr[, c("id", "abs.rsumLDP_VshareVP")]

# Attach this variable back onto elec.dat:
elec.dat.ranks <- merge(elec.dat, tomerge, by = c("id"))

# Error checking: list the rank-related columns for one district-year.
rank_cols <- c("year", "muncode_num", "hor_electoral_district", "num",
               "sumLDP_VshareVP", "rsumLDP_VshareVP", "abs.rsumLDP_VshareVP")
check <- elec.dat.ranks[which(elec.dat.ranks[["year"]] == 1980 &
                                elec.dat.ranks[["hor_electoral_district"]] == 101), ]
check[, rank_cols]

# Districts without an LDP candidate (this first selection is immediately
# replaced by the 2014_110 selection below):
check <- elec.dat.ranks[which(elec.dat.ranks[["VotesLDP"]] == 0), ]
check <- elec.dat.ranks[which(elec.dat.ranks[["district_year"]] == "2014_110"), ]
check[, rank_cols]

# NB: when there is no LDP winner in a district, sumLDP_VshareVP = 0 and rsumLDP_VshareVP is NA,
# but abs.rsumLDP_VshareVP is populated.  We need to make these NA.

# Lets pull all the year_districts where rsumLDP_VshareVP is NA and store them
# in a vector called all_same:
# district_years in which rsumLDP_VshareVP is NA for every municipality.
# The column is grouped once with split() rather than re-filtering
# elec.dat.ranks for each district.
year_dists <- unique(elec.dat.ranks$district_year)
rank_by_dist <- split(elec.dat.ranks$rsumLDP_VshareVP,
                      elec.dat.ranks$district_year)
is_all_na <- vapply(rank_by_dist[as.character(year_dists)],
                    function(v) length(v) > 0 && all(is.na(v)),
                    logical(1))
all_same <- year_dists[is_all_na]
all_same # 892

# Start from a copy of the absolute ranks ...
elec.dat.ranks$new.abs.rsumLDP_VshareVP <- elec.dat.ranks$abs.rsumLDP_VshareVP

# ... and blank it out for every observation whose district_year is in all_same:
elec.dat.ranks$new.abs.rsumLDP_VshareVP <- ifelse(
  elec.dat.ranks$district_year %in% all_same,
  NA,
  elec.dat.ranks$new.abs.rsumLDP_VshareVP
)

check <- subset(elec.dat.ranks, elec.dat.ranks$district_year == "2014_110")
check[, c("year", "muncode_num", "hor_electoral_district", "num",
          "sumLDP_VshareVP", "rsumLDP_VshareVP", "abs.rsumLDP_VshareVP",
          "new.abs.rsumLDP_VshareVP")]

# Replace the old abs.rsumLDP_VshareVP with the corrected version under the
# same name:
elec.dat.ranks$abs.rsumLDP_VshareVP <- NULL
elec.dat.ranks$abs.rsumLDP_VshareVP <- elec.dat.ranks$new.abs.rsumLDP_VshareVP
elec.dat.ranks$new.abs.rsumLDP_VshareVP <- NULL

# Check: among rows where rsumLDP_VshareVP is NA, count how many also have NA
# in abs.rsumLDP_VshareVP (compare with the row count printed just before).
ex <- subset(elec.dat.ranks, is.na(rsumLDP_VshareVP))
nrow(ex)
sum(is.na(ex[["abs.rsumLDP_VshareVP"]]))

# The absolute rankings on rsum now live in elec.dat.ranks$abs.rsumLDP_VshareVP;
# carry the enlarged frame forward as elec.dat.
elec.dat <- elec.dat.ranks


# ---------------------------------------------
# Write the turnout + transfer data to disk:

saveRDS(elec.dat, file = "Book_Mun_Level_Transfers.rds")







# -----------------------------------------------------
# Merge it back into dat and save the entire file out

# New variables we made on elec.dat (and want to keep), plus id, are:

# id plus the variables created on elec.dat that should be carried into dat:
newvars <- c(
  "id", "kucoder", "district_reform2",
  "sumgov_VshareVP", "rsumgov_VshareVP", "abs.rsumgov_VshareVP",
  "KomSSD",
  "sumallLDP_VshareVP", "rsumallLDP_VshareVP", "abs.rsumallLDP_VshareVP",
  "resurrectSSD", "abs.rsumLDP_VshareVP"
)

elec.dat2 <- elec.dat[, newvars, drop = FALSE]

# Full outer join on id, then write the complete file:
dat1 <- merge(dat, elec.dat2, by = "id", all = TRUE)

saveRDS(dat1, file = "Master_plus_Snow_TurnTrans.rds")









# ---------------------------------------
# PART 3. 
# VARIABLES NEEDED FOR DISTRICT-LEVEL TRANSFERS ANALYSES
# ---------------------------------------

dat <- readRDS("Master_plus_Snow_TurnTrans.rds")






# -----------------------------------------------------
#
# Make Lngaid_pc (lagged ngaid_pc variable)

myvars <- c("id", "year", "muncode_num", "ngaid_pc")
dat2 <- dat[myvars]

# Restrict dataset to municipality-years that have a muncode_num:
dat3 <- dat2[!is.na(dat2$muncode_num), ]

# Further restrict to municipality-years where ngaid_pc is populated:
dat4 <- dat3[!is.na(dat3$ngaid_pc), ]

# Order by muncode_num first, and then year (columns referenced by name rather
# than by position):
dat5 <- dat4[order(dat4$muncode_num, dat4$year), ]
dat5[1:100, ]

# Make sure there is only one row per muncode_num and year:
dat5$check <- paste(dat5$muncode_num, dat5$year, sep = "_")
length(dat5$check) - length(unique(dat5$check)) # 0 duplicated observations
dat5$check <- NULL # delete column

library(data.table)
dat6 <- as.data.table(dat5)

# Create the lagged ngaid_pc variable (Lngaid_pc): within each municipality,
# the ngaid_pc value of the preceding row.  Exactly one lag is requested,
# because exactly one target column is named on the left of `:=`.
# NOTE(review): rows with missing ngaid_pc were dropped above, so "preceding
# row" is the previous populated observation, not necessarily the previous
# calendar year -- confirm that this is the intended definition.
dat6 <- dat6[, Lngaid_pc := shift(ngaid_pc, 1L, type = "lag"), by = muncode_num]
# Can check here:
dat6[1:50, ]

# The lag was built on a subset of the data; keep only the key and the new
# column (selected by name) for merging back into dat:
names(dat6)
smalldat <- dat6[, .(id, Lngaid_pc)]

# Attach the new variable onto dat:
dta <- merge(dat, smalldat, by = "id", all = TRUE)

dat <- dta





# -----------------------------------------
# CREATING DISTRICT-LEVEL VARIABLES (financial and all)

# We have to make all the district-level variables for the 2014 election, except those that
# were made exclusively with JED-M data, which we had for 2014.

# ------------------------------------------
# Variables that were made exclusively with the JED data in 2014 so are fine:
# Verified by checking for NAs and looking at vars

# Rows for 2014 that have an electoral district; each statement below prints
# the number of NAs in one variable.
check <- dat[which(dat[["year"]] == 2014 & !is.na(dat[["hor_electoral_district"]])), ]

# VotePop: sum(mun_voting_pop) in each district-year
sum(is.na(check[["VotePop"]]))

# VotesLDP: LDP_`t'*cand_0`t'_votes (votes the municipality cast for LDP cands);
# when 0, it means a Komeito candidate ran
sum(is.na(check[["VotesLDP"]]))

# VotesWinLDP: LDP_`t'*cand_0`t'_votes*win_`t' (votes cast for winning LDP cands)
sum(is.na(check[["VotesWinLDP"]]))

# frac: mun_voting_pop/VotePop
sum(is.na(check[["frac"]]))

# PopIndex (district-level): sum(frac^2)
sum(is.na(check[["PopIndex"]]))

# num: number of municipalities in each district-year
sum(is.na(check[["num"]]))

# HI (district-level): (PopIndex-1/num)/(1-1/num); stretches PopIndex from 0 to 1
# and is NA when num==1
sum(is.na(check[["HI"]]))
check2 <- check[which(is.na(check[["HI"]])), ]
sum(is.na(check[["HI"]])) == sum(check2[["num"]] == 1)
# yes (all HI== NA are municipalities in districts with one municipality)
# Checked various ways.  By the way, 24 SSDs in 2014 have one municipality in the district.

# logNum: log(num)
sum(is.na(check[["logNum"]]))

# sdVotePop: not sd(VotePop); logVP = log(VotePop) is calculated for each muni
# first, and then sd(logVP) within a district-year
sum(is.na(check[["sdVotePop"]]))
check2 <- check[which(is.na(check[["sdVotePop"]])), ]
sum(is.na(check[["sdVotePop"]])) == sum(check2[["num"]] == 1)

# DisLDPvotes: sum(VotesLDP)
sum(is.na(check[["DisLDPvotes"]]))

# DisLDPVS: DisLDPvotes/voting_pop_in_electoral_district
sum(is.na(check[["DisLDPVS"]]))

# DisWinLDPvotes: sum(VotesWinLDP)
sum(is.na(check[["DisWinLDPvotes"]]))

# DisWinLDPVS: DisWinLDPvotes/voting_pop_in_electoral_district
sum(is.na(check[["DisWinLDPVS"]]))





# ------------------------------------------
# Lets subset to electoral data only

# Keep only the rows that have an electoral district.
elec.dat <- dat[which(!is.na(dat[["hor_electoral_district"]])), ]

# ------------------------------
# DisLDPVS and DisWinLDPVS use "voting_pop_in_electoral_district" as the
# denominator.  That figure comes from ku_electorate (RS data) and includes the
# population of split municipalities, which are not in the data.

# Rebuild the district-level LDP support variables with VotePop as the
# denominator instead (the district voting population with the split munis
# excluded):

elec.dat[["DisLDPVS2"]] <- with(elec.dat, DisLDPvotes / VotePop)
elec.dat[["DisWinLDPVS2"]] <- with(elec.dat, DisWinLDPvotes / VotePop)



# -----------------------
# Now, lets remake all the district-level variables (we only need to remake them for 
# 2014, but lets remake all at the same time, so how we do it is consistent).

# The district-level transfer variables are made by taking all the munis in a 
# given district-year and summing up the transfers they received in that
# district-year.

# The district-level lag and lead transfer variables are made by taking all the munis
# in a given district-year and summing up their lag and lead transfer amounts.  If a municipality
# ceased to exist in a year after an election, its lead variables would be NA, so would not be summed.
# If a municipality didn't exist the prior year, its lag variables would be NA, so 
# this would not enter the summation, either.

dist_year <- unique(elec.dat$district_year)

# One result per district-year is stored in a pre-sized list and stacked once
# at the end.  Each result is built with data.frame(), so the numeric columns
# stay numeric: there is no detour through character strings and therefore no
# conversion back afterwards.
district_rows <- vector("list", length(dist_year))

for (j in seq_along(dist_year)) {

  d <- subset(elec.dat, elec.dat$district_year == dist_year[j])

  district_year <- unique(d$district_year)
  m <- unique(d$totseat_in_electoral_district)

  # First, make the district-level variables for that year.  Missing values
  # are dropped from every sum (na.rm = TRUE).
  DISpop <- sum(d$mun_population, na.rm = TRUE)

  # Population share of each municipality, used as the weight below:
  d$fracpop <- d$mun_population / DISpop
  DISceif <- sum(d$mun_ceif * d$fracpop, na.rm = TRUE)
  DISneedy <- sum(d$needy_pc * d$fracpop, na.rm = TRUE)
  DISprimary <- sum(d$primary_pc * d$fracpop, na.rm = TRUE)
  DISincomepc <- sum(d$income_pc * d$fracpop, na.rm = TRUE)
  DISarea <- sum(d$mun_area_size, na.rm = TRUE)
  DISdensity <- DISpop / (1000 * DISarea)
  DISdensity2 <- DISpop / DISarea
  # was DisMoney (total ngaid received by district)
  DISngaid <- sum(d$ngaid_pc * d$mun_population, na.rm = TRUE)
  # was DisMoneypc (ngaid per person received by district)
  DISngaidpc <- DISngaid / DISpop
  DISmal <- DISpop / (100000 * m)

  # Second, make the lead and lag variables.

  # This was FDisMoney.  For each muni in the district, it takes the per person
  # money it received the subsequent year, multiplies it by the people in the
  # municipality in the current year, and sums them.
  # AS used sum(exp(F1logngaid_pc)* mun_population)
  FDISngaid <- sum(d$F1ngaid_pc * d$mun_population, na.rm = TRUE)

  # This was FDismoneypc
  FDISngaidpc <- FDISngaid / DISpop

  # Wasn't made in original data
  LDISngaid <- sum(d$Lngaid_pc * d$mun_population, na.rm = TRUE)

  # Wasn't made in original data
  LDISngaidpc <- LDISngaid / DISpop

  district_rows[[j]] <- data.frame(district_year, DISpop, DISceif,
                                   DISneedy, DISprimary, DISincomepc,
                                   DISarea, DISdensity, DISdensity2,
                                   DISngaid, DISngaidpc, DISmal,
                                   FDISngaid, FDISngaidpc,
                                   LDISngaid, LDISngaidpc,
                                   stringsAsFactors = FALSE)
}

# district_year plus 15 numeric district-level variables:
all.the.years <- do.call(rbind, district_rows)

dim(all.the.years)

dim(all.the.years)





# -------------------------------------------------------
# FYI, of the 2637 district-years, how many don't have DISngaidpc?
# This means that they were comprised only of municipalities for which we don't
# have transfer (and likely other financial) data:
# District-years whose per-capita transfers equal 0:
check <- all.the.years[which(all.the.years[["DISngaidpc"]] == 0), ]
dim(check)
# 273

# District-years whose lagged per-capita transfers equal 0:
check <- all.the.years[which(all.the.years[["LDISngaidpc"]] == 0), ]
dim(check)
# 400

# District-years whose lead per-capita transfers equal 0:
check <- all.the.years[which(all.the.years[["FDISngaidpc"]] == 0), ]
dim(check)
# 282


# -------------------------------------------
# Now, lets merge the data saved in all.the.years back into elec.dat.

# First, lets get rid of the 12 equivalent district-level variables 
# that AS made in Stata to avoid confusion:

# Column count before and after removing the 12 Stata-made district variables:
ncol(elec.dat)
stata_vars <- c("DisPop", "DISmun_ceif", "DISmun_needy_pc", "DISprimary_pc",
                "DISincome_pc", "DIS_area", "DisPopDensity", "DisMoney",
                "DisMoneypc", "malaportion", "FDisMoney", "FDismoneypc")
for (old_var in stata_vars) {
  elec.dat[[old_var]] <- NULL
}
ncol(elec.dat)


# Attach the all.the.years data onto elec.dat via district_year
# (full outer join):
dta <- merge(elec.dat, all.the.years, by = "district_year", all = TRUE)
ncol(dta) - ncol(elec.dat)
# 15 new columns added

# Can check: number of distinct values of each new variable within one
# district-year.
check <- dta[which(dta[["district_year"]] == "1989_101"), ]
length(unique(check[["DISpop"]]))
length(unique(check[["DISceif"]]))
length(unique(check[["DISneedy"]]))
length(unique(check[["DISprimary"]]))
length(unique(check[["DISincomepc"]]))
length(unique(check[["DISarea"]]))
length(unique(check[["DISdensity"]]))
length(unique(check[["DISdensity2"]]))


elec.dat <- dta









# -----------------------------------------------
# TAKING LOGS 
# In our regression analyses, we use logs of some of these variables.

ncol(elec.dat) # 489

# ---------------------------------
# Drop the two logged variables made in the original data and rebuild them
# from the new district aggregates.
elec.dat[["DisLogPop"]] <- NULL
elec.dat[["DisLogIncome"]] <- NULL

elec.dat[["logDISpop"]] <- log(elec.dat[["DISpop"]])
elec.dat[["logDISincomepc"]] <- log(elec.dat[["DISincomepc"]])

# District-years whose logged population is not finite:
check <- elec.dat[!is.finite(elec.dat[["logDISpop"]]), ]
length(unique(check[["district_year"]]))
# In 46 district_years, DISpop is 0.  This means that these districts
# were comprised of municipalities for which we did not have population data.
# Almost all of them are in 2014.

# District-years whose logged income is not finite:
check <- elec.dat[!is.finite(elec.dat[["logDISincomepc"]]), ]
length(unique(check[["district_year"]]))
# In 319 district_years, DISincomepc is 0. This means that these districts
# were comprised of municipalities for which we did not have income data.

# -----------------------------------------
# Logged district-level money variables

elec.dat[["logLDISngaidpc"]] <- log(elec.dat[["LDISngaidpc"]])
# was previously logDismoneypc <- log(DisMoneypc)
elec.dat[["logDISngaidpc"]] <- log(elec.dat[["DISngaidpc"]])
# was previously logFDismoneypc <- log(FDismoneypc)
elec.dat[["logFDISngaidpc"]] <- log(elec.dat[["FDISngaidpc"]])

check <- elec.dat[!is.finite(elec.dat[["logDISngaidpc"]]), ]
length(unique(check[["district_year"]]))
# In 319 district_years, DISngaidpc is 0.  This means that these districts
# were comprised of municipalities for which we did not have ngaid data.

check <- elec.dat[!is.finite(elec.dat[["logFDISngaidpc"]]), ]
length(unique(check[["district_year"]]))
# In 328 district-years, FDISngaidpc is 0.





# -----------------------------------------------
# CAPTURING SENIORS 

# Lets make a variable counting the number of senior LDP winners in the municipality's district:

# For each row, the sum over candidate slots 1-6 of LDP_k * win_k * senior_k.
# Computed for all rows at once: each slot contributes one column of products,
# and rowSums() adds them up while skipping missing products (na.rm = TRUE).
senior_terms <- lapply(1:6, function(k) {
  elec.dat[[paste0("LDP_", k)]] *
    elec.dat[[paste0("win_", k)]] *
    elec.dat[[paste0("senior_", k)]]
})
numLDPwinsen <- rowSums(do.call(cbind, senior_terms), na.rm = TRUE)

elec.dat$numLDPwinsen <- numLDPwinsen





# -----------------------------------------------------
# Merge it back into dat and save the entire file out

# New variables we made (and want to keep), plus id, are:

# id plus every district-level variable created in this part:
newvars <- c(
  "id",
  "DisLDPVS2", "DisWinLDPVS2",
  "DISpop", "DISceif", "DISneedy", "DISprimary", "DISincomepc",
  "DISarea", "DISdensity", "DISdensity2",
  "DISngaid", "DISngaidpc", "DISmal",
  "FDISngaid", "FDISngaidpc", "LDISngaid", "LDISngaidpc",
  "logDISpop", "logDISincomepc",
  "logLDISngaidpc", "logDISngaidpc", "logFDISngaidpc",
  "numLDPwinsen"
)

elec.dat2 <- elec.dat[, newvars, drop = FALSE]

# Full outer join on id, then write the complete file:
dat1 <- merge(dat, elec.dat2, by = "id", all = TRUE)

saveRDS(dat1, file = "Master_plus_Snow_Turn_Trans_Dis.rds")








# ---------------------------------------
# PART 4. 
# 
# Lets remake mun_population_density so it is not the rescaled version. 
# ---------------------------------------

dat <- readRDS("Master_plus_Snow_Turn_Trans_Dis.rds")

dim(dat)
# 105353    506

# mun_population_density2 = population / area size
dat[["mun_population_density2"]] <- with(dat, mun_population / mun_area_size)
summary(dat[["mun_population_density2"]], exclude = NULL) # 1545 na
boxplot(dat$mun_population_density2)

# The existing mun_population_density variable should be this divided by 1,000:
summary(dat[["mun_population_density"]], exclude = NULL) # 1545 na
# yes, same

check <- dat[which(dat[["mun_population_density2"]] == 0), ]
sum(is.na(dat[["mun_population_density2"]]))
# There are 1545 NAs.

# lndensity is computed on the rows that have a density value ...
new.dat <- dat[!is.na(dat[["mun_population_density2"]]), ]
new.dat[["lndensity"]] <- NULL
new.dat[["lndensity"]] <- log(new.dat[["mun_population_density2"]])
new.dat[100:115, c("year", "mun_population", "mun_area_size",
                   "mun_population_density2", "lndensity")]

# ... reduced to the merge key and the new column ...
myvars <- c("id", "lndensity")
newdat2 <- new.dat[myvars]

# ... and merged onto dat after removing any lndensity already in dat:
dat[["lndensity"]] <- NULL

dat1 <- merge(dat, newdat2, by = "id", all = TRUE)

#sum(is.na(dat1$mun_population_density))
#sum(is.na(dat1$lndensity))
#check <- subset(dat1, !is.na(dat1$mun_population_density))
#sum(is.na(check$lndensity))

# Save out:
saveRDS(dat1, file = "Master_plus_Snow_Turn_Trans_Dis2.rds")









# ---------------------------------------
# PART 5. 
# 
# Lets add variables used in turnout analyses
# ---------------------------------------

dat <- readRDS("Master_plus_Snow_Turn_Trans_Dis2.rds")

# Remove the variables that are rebuilt below:
for (old_var in c("votes_single_LDPc", "votes_single_LDPw", "vs_single_LDPc",
                  "vs_single_LDPw", "Lvs_single_LDPw")) {
  dat[[old_var]] <- NULL
}

# ------------------------------------
# Keep only the rows that have an electoral district.
elec.dat <- dat[which(!is.na(dat[["hor_electoral_district"]])), ]

# ------------------------------------
# ldpsenior: 1 when numLDPwinsen is non-zero, 0 when it is zero.

sum(is.na(elec.dat[["numLDPwinsen"]]))
elec.dat[["ldpsenior"]] <- as.numeric(elec.dat[["numLDPwinsen"]] != 0)
table(elec.dat[["ldpsenior"]])
class(elec.dat[["ldpsenior"]])




# ------------------------------------
#  make vs_single_c and rvs_single_c

# To measure the degree to which a group strongly associates with one party,
# we use (votes for a single candidate)/(votes cast).

# In each municipality-year, get the votes of all the candidates who ran, and 
# record the max.

# Largest vote count among candidate slots 01-17 in each municipality-year.
# pmax() takes the row-wise maximum across the vote columns in one call; with
# its default na.rm = FALSE a missing vote count in any slot gives NA for that
# row, exactly as max() over the row's values does.
vote_cols <- sprintf("cand_%02d_votes", 1:17)
# Slots that are not present as columns are skipped.
vote_cols <- vote_cols[vote_cols %in% names(elec.dat)]

votes_single_c <- do.call(pmax, unname(as.list(elec.dat[vote_cols])))

length(votes_single_c)
elec.dat$votes_single_c <- votes_single_c
# (range() has no `exclude` argument, so none is passed.)
range(elec.dat$votes_single_c)

# interesting case, as votes divided evenly among candidates.  No concentration.
# check <- subset(elec.dat, elec.dat$votes_single_c==20)
#check[, c("cand_01_pty", "cand_02_pty", "cand_03_pty", "cand_04_pty", "cand_05_pty")]
# elec.dat[1:50, c("votes_single_LDPc", "mun_voted")]

# Votes for the top candidate as a share of votes cast:
elec.dat$vs_single_c <- elec.dat$votes_single_c / elec.dat$mun_voted
range(elec.dat$vs_single_c)

# Create a ranked version of this variable, rvs_single_c.
# Ranked versions are necessary because the number of candidates running
# differs across districts.

# Step 1: absolute rank of vs_single_c within each district_year.
# Munis with the highest score receive the highest rank; tied munis receive
# the same (average) rank; a district with a single municipality gets rank 1.
elec.dat <- transform(elec.dat,
                      rank.vs_single_c = ave(vs_single_c, district_year,
                                             FUN = function(x) {
                                               rank(x, ties.method = "average")
                                             }))

#check <- subset(elec.dat, elec.dat$hor_electoral_district==1302 & elec.dat$year==1993)
#check[, c("vs_single_c", "rank.vs_single_c")]
#check <- subset(elec.dat, elec.dat$num==1)
# check$rank.vs_single_c

# Step 2: because the number of municipalities differs by district,
# standardize so that in every district-year the municipality with the lowest
# degree of identification with a single candidate receives 0 and the highest
# receives 1:  (absolute rank - 1) / (total number of munis - 1).
# Computed for all rows at once.  Where num is 1 the denominator is 0, so the
# result is not a finite number.
rvs_single_c <- (elec.dat$rank.vs_single_c - 1) / (elec.dat$num - 1)

length(rvs_single_c) == nrow(elec.dat)
elec.dat$rvs_single_c <- rvs_single_c

# Check: list the new columns for district 101 in 1986 and in 2014.
check <- elec.dat[which(elec.dat[["year"]] == 1986 &
                          elec.dat[["hor_electoral_district"]] == 101), ]
nrow(check)
check[, c("muncode_num", "district_year", "vs_single_c",
          "rank.vs_single_c", "rvs_single_c")]

check <- elec.dat[which(elec.dat[["year"]] == 2014 &
                          elec.dat[["hor_electoral_district"]] == 101), ]
nrow(check)
check[, c("muncode_num", "district_year", "mun_voted", "votes_single_c",
          "vs_single_c", "rank.vs_single_c", "rvs_single_c")]

# In SUM:
# This section makes vs_single_c and rvs_single_c.

names(elec.dat)










# ------------------------------------
# vs_single_LDPc
# To measure the degree to which a group strongly associates with one LDP candidate,
# Votes for a single LDP candidate/votes cast.

# In each municipality-year, get the votes of all the LDP candidates who ran, and 
# record the max.

# LDP candidates go up to 9th candidate.  After this, there are no LDP candidates.
# sum(na.omit(elec.dat$cand_09_pty=="LDP"))

# Largest vote count among the LDP candidates (slots 1-9) in each
# municipality-year, computed for all rows at once.
n_obs <- nrow(elec.dat)

# One vector per slot: the slot's votes where LDP_k == 1, and -Inf elsewhere so
# that non-LDP slots can never win the row-wise maximum.
ldp_votes_by_slot <- lapply(1:9, function(k) {
  is_ldp <- elec.dat[[paste0("LDP_", k)]] == 1
  # A missing LDP flag is treated as a hard error rather than guessed at.
  if (anyNA(is_ldp)) {
    stop("LDP_", k, " contains missing values", call. = FALSE)
  }
  slot_votes <- rep(-Inf, n_obs)
  slot_votes[is_ldp] <- elec.dat[[sprintf("cand_%02d_votes", k)]][is_ldp]
  slot_votes
})

# Row-wise maximum; a missing vote count for an LDP candidate gives NA.
votes_single_LDPc <- do.call(pmax, ldp_votes_by_slot)

# Rows still at -Inf had no LDP candidate in any slot.  Let us assign these 0:
votes_single_LDPc[which(votes_single_LDPc == -Inf)] <- 0

length(votes_single_LDPc)
elec.dat$votes_single_LDPc <- votes_single_LDPc
# (range() has no `exclude` argument, so none is passed.)
range(elec.dat$votes_single_LDPc)

# Votes for the top LDP candidate as a share of votes cast:
elec.dat$vs_single_LDPc <- elec.dat$votes_single_LDPc / elec.dat$mun_voted
range(elec.dat$vs_single_LDPc)

#check <- subset(elec.dat, elec.dat$votes_single_LDPc==0)
#check[, c("cand_01_pty", "cand_02_pty", "cand_03_pty", "cand_04_pty", "cand_05_pty")]

#check <- subset(elec.dat, elec.dat$year==1986 & elec.dat$hor_electoral_district==101)
#nrow(check)
#check[, c("muncode_num", "district_year", "votes_single_c", "vs_single_c", "rvs_single_c", 
#                                          "votes_single_LDPc", "vs_single_LDPc")]

#check <- subset(elec.dat, elec.dat$DisLDPvotes==0)
#sum(check$vs_single_LDPc==0) == nrow(check)

# Create a ranked version of this variable, rvs_single_LDPc.
# Ranked versions are necessary because number of LDP candidates running differs across districts.

# Take the vs_single_LDPc scores of all the municipalities in each district year,
# rank them so that the municipality identifying the least with a single LDP candidate receives 0
# and the municipality identifying the most with a single LDP candidate receives 1.

# Rank vs_single_LDPc within each district_year; tied municipalities share
# the average of their ranks.
elec.dat <- transform(elec.dat, rank.vs_single_LDPc = ave(vs_single_LDPc, district_year,
                                                          FUN = function(x) rank(x, ties.method="average")))

# When the district_year has no LDP candidate, vs_single_LDPc is 0 for every
# municipality, but rank() still hands out (tied) ranks. Those rows are set
# to NA below.

# District-years in which every municipality has votes_single_LDPc == 0:
no_ldp_cand <- vapply(
  split(elec.dat$votes_single_LDPc == 0, elec.dat$district_year),
  all,
  logical(1)
)
# An undecidable district-year (missing votes, no non-zero value) made the
# original loop fail; keep failing loudly.
if (anyNA(no_ldp_cand)) {
  stop("votes_single_LDPc is missing in district-year(s): ",
       paste(names(no_ldp_cand)[is.na(no_ldp_cand)], collapse = ", "),
       call. = FALSE)
}
# Sorted by district_year rather than in order of appearance; the vector is
# only used for membership tests and counting.
all_same <- names(no_ldp_cand)[no_ldp_cand]

length(all_same)

# Create a new version of the "rank" variable that receives NA when municipalities are in districts
# WITHOUT LDP candidates:
elec.dat$new.rank.vs_single_LDPc <- ifelse(elec.dat$district_year %in% all_same,
                                           NA,
                                           elec.dat$rank.vs_single_LDPc)

# Because the number of municipalities differs by district, we can standardize this so that in every
# district-year, the municipality with the lowest degree of identification with a single candidate
# receives 0 and the highest receives 1.
# This formula will do that: (absolute rank-1)/(total number of munis-1)
# NOTE(review): `num` is assumed to hold the number of municipalities in the
# district-year; a district-year with num == 1 yields NaN (0/0) - confirm.
rvs_single_LDPc <- (elec.dat$new.rank.vs_single_LDPc - 1) / (elec.dat$num - 1)

length(rvs_single_LDPc) == nrow(elec.dat)
elec.dat$rvs_single_LDPc <- rvs_single_LDPc

# Check:
# Eyeball the new columns for electoral district 101 in 1986.
is_1986_d101 <- elec.dat$year == 1986 & elec.dat$hor_electoral_district == 101
check <- elec.dat[which(is_1986_d101), , drop = FALSE]
nrow(check)
cols_1986 <- c("muncode_num", "district_year", "vs_single_c", "rvs_single_c",
               "vs_single_LDPc", "new.rank.vs_single_LDPc", "rvs_single_LDPc")
check[, cols_1986]

# Makes: vs_single_LDPc and rvs_single_LDPc

# Same inspection for electoral district 101 in 2014.
is_2014_d101 <- elec.dat$year == 2014 & elec.dat$hor_electoral_district == 101
check <- elec.dat[which(is_2014_d101), , drop = FALSE]
nrow(check)
cols_2014 <- c("muncode_num", "district_year", "mun_voted", "votes_single_c",
               "votes_single_LDPc", "vs_single_LDPc", "new.rank.vs_single_LDPc",
               "rvs_single_LDPc")
check[, cols_2014]

names(elec.dat)








# ------------------------------------
# vs_single_LDPw
# To measure the degree to which a group strongly associates with one LDP winner,
# Votes for a single LDP winner/votes cast.

# In each municipality-year, get the votes of all the LDP winners who ran, and 
# record the max.

# Drop any existing copy of the share; it is rebuilt below.
elec.dat$vs_single_LDPw <- NULL

# votes_single_LDPw: for every municipality-year row, the largest vote count
# received by a candidate who is both LDP (LDP_k == 1) and a winner
# (win_k == 1) among candidate slots 1-6; 0 when the row has no LDP winner.
#
# Built column-wise instead of row by row (no per-row subsetting, no vector
# growth, no per-row progress output).
n_mun_years <- nrow(elec.dat)
best_ldp_win_votes <- rep(-Inf, n_mun_years)   # -Inf = no LDP winner seen yet
n_ldp_winners <- integer(n_mun_years)

for (k in 1:6) {
  ldp_flag <- elec.dat[[paste0("LDP_", k)]]
  win_flag <- elec.dat[[paste0("win_", k)]]
  cand_votes <- elec.dat[[sprintf("cand_%02d_votes", k)]]
  stopifnot(
    length(ldp_flag) == n_mun_years,
    length(win_flag) == n_mun_years,
    length(cand_votes) == n_mun_years
  )

  is_ldp_winner <- ldp_flag == 1 & win_flag == 1
  # An undecidable row made the row-wise `if` fail; keep failing loudly.
  if (anyNA(is_ldp_winner)) {
    stop("cannot tell whether candidate ", k, " is an LDP winner in some rows ",
         "(missing LDP_", k, " or win_", k, ")", call. = FALSE)
  }

  # pmax() propagates NA votes, exactly as max() did in the row-wise version.
  best_ldp_win_votes <- ifelse(is_ldp_winner,
                               pmax(best_ldp_win_votes, cand_votes),
                               best_ldp_win_votes)
  n_ldp_winners <- n_ldp_winners + is_ldp_winner
}

# Sanity check: a row cannot have more LDP winners than seats in its district.
# The original loop used `break` here, which silently truncated the result
# vector; the later column assignment then either failed with an unrelated
# message or recycled the short vector. Fail explicitly instead.
seats <- elec.dat$totseat_in_electoral_district
bad_rows <- which(is.na(seats) | n_ldp_winners > seats)
if (length(bad_rows) > 0) {
  stop("more LDP winners than seats (or missing seat count) in ",
       length(bad_rows), " row(s); first offending row: ", bad_rows[1],
       call. = FALSE)
}

# Rows without any LDP winner are assigned 0.
votes_single_LDPw <- ifelse(n_ldp_winners > 0, best_ldp_win_votes, 0)

length(votes_single_LDPw)
elec.dat$votes_single_LDPw <- votes_single_LDPw
# An NA in the printed range signals missing values in the column.
range(elec.dat$votes_single_LDPw, exclude = NULL)

# check:
# After 1996, when cand_01_pty is not LDP, votes_single_LDPw should be zero
check <- subset(elec.dat, elec.dat$cand_01_pty!="LDP" & elec.dat$year>1995)
table(check$votes_single_LDPw)
# all zeros

elec.dat[1:50, c("votes_single_LDPw", "mun_voted")]

# Calculate votes cast for a single LDP winner/votes cast
elec.dat$vs_single_LDPw <- elec.dat$votes_single_LDPw / elec.dat$mun_voted
range(elec.dat$vs_single_LDPw, exclude = NULL)

# Create a ranked version of this variable, rvs_single_LDPw.

# Take the vs_single_LDPw scores of all the municipalities in each district year,
# rank them so that the municipality identifying the least with a single LDP winner receives 0
# and the municipality identifying the most with a single LDP winner receives 1.

# Rank vs_single_LDPw within each district_year; tied municipalities share
# the average of their ranks.
elec.dat <- transform(elec.dat, rank.vs_single_LDPw = ave(vs_single_LDPw, district_year,
                                                          FUN = function(x) rank(x, ties.method="average")))

# When the district_year has no LDP winner, vs_single_LDPw is 0 for every
# municipality, but rank() still hands out (tied) ranks. Those rows are set
# to NA below.

# District-years in which every municipality has votes_single_LDPw == 0:
no_ldp_winner <- vapply(
  split(elec.dat$votes_single_LDPw == 0, elec.dat$district_year),
  all,
  logical(1)
)
# An undecidable district-year (missing votes, no non-zero value) made the
# original loop fail; keep failing loudly.
if (anyNA(no_ldp_winner)) {
  stop("votes_single_LDPw is missing in district-year(s): ",
       paste(names(no_ldp_winner)[is.na(no_ldp_winner)], collapse = ", "),
       call. = FALSE)
}
# Sorted by district_year rather than in order of appearance; the vector is
# only used for membership tests and counting.
all_same <- names(no_ldp_winner)[no_ldp_winner]

length(all_same)
check <- subset(elec.dat, elec.dat$district_year %in% all_same)
length(unique(check$district_year))
# 798
check$rank.vs_single_LDPw # these need to be NA

# Create a new version of the "rank" variable that receives NA when municipalities are in districts
# WITHOUT LDP winners:
elec.dat$new.rank.vs_single_LDPw <- ifelse(elec.dat$district_year %in% all_same,
                                           NA,
                                           elec.dat$rank.vs_single_LDPw)

sum(is.na(elec.dat$new.rank.vs_single_LDPw)) == nrow(check)

# Because the number of municipalities differs by district, we can standardize this so that in every
# district-year, the municipality with the lowest degree of identification with a single LDP winner
# receives 0 and the highest receives 1.
# This formula will do that: (absolute rank-1)/(total number of munis-1)
# NOTE(review): `num` is assumed to hold the number of municipalities in the
# district-year; a district-year with num == 1 yields NaN (0/0) - confirm.
rvs_single_LDPw <- (elec.dat$new.rank.vs_single_LDPw - 1) / (elec.dat$num - 1)

length(rvs_single_LDPw) == nrow(elec.dat)
elec.dat$rvs_single_LDPw <- rvs_single_LDPw

# Check:
# Eyeball the LDP-winner columns for electoral district 101 in 1986.
is_1986_d101 <- elec.dat$year == 1986 & elec.dat$hor_electoral_district == 101
check <- elec.dat[which(is_1986_d101), , drop = FALSE]
nrow(check)
cols_1986 <- c("muncode_num", "mun_voted",
               "votes_single_LDPw", "vs_single_LDPw", "new.rank.vs_single_LDPw",
               "rvs_single_LDPw")
check[, cols_1986]

# Same inspection for electoral district 101 in 2014.
is_2014_d101 <- elec.dat$year == 2014 & elec.dat$hor_electoral_district == 101
check <- elec.dat[which(is_2014_d101), , drop = FALSE]
nrow(check)
cols_2014 <- c("muncode_num", "district_year", "mun_voted", "votes_single_c",
               "votes_single_LDPc", "votes_single_LDPw", "vs_single_LDPw",
               "new.rank.vs_single_LDPw", "rvs_single_LDPw")
check[, cols_2014]

# Makes: vs_single_LDPw and rvs_single_LDPw








# ---------------------------------------------
# MERGE THEM BACK ONTO DAT

# Reduce elec.dat to new variables:

# Columns carried back onto the full data set; "id" is the merge key.
# (ldpsenior and the *_single_c columns are presumably created earlier in
# this file - confirm.)
myvars <- c(
  "id", "ldpsenior",
  "votes_single_c", "vs_single_c", "rank.vs_single_c", "rvs_single_c",
  "votes_single_LDPc", "vs_single_LDPc", "new.rank.vs_single_LDPc", "rvs_single_LDPc",
  "votes_single_LDPw", "vs_single_LDPw", "new.rank.vs_single_LDPw", "rvs_single_LDPw"
)

elec.dat2 <- elec.dat[, myvars, drop = FALSE]

# Full outer join on id: rows of dat with no match in elec.dat2 are kept and
# receive NA in the new columns.
dat1 <- merge(x = dat, y = elec.dat2, by = "id", all = TRUE)

saveRDS(dat1, file = "Master_plus_Snow_Turn_Trans_Dis3.rds")









# ---------------------------------------
# PART 6:

# Add variables we made in Book_Turnout_Analysis.R
# and Book_Mun_Level_Transfers_Analysis.PRE.R
#
# ----------------------------------------

# Reload the merged data set from disk.
dat <- readRDS("Master_plus_Snow_Turn_Trans_Dis3.rds")




# ------------------------------------
# Keep only the rows that carry an electoral district code.
rows_with_district <- which(!is.na(dat$hor_electoral_district))
elec.dat <- dat[rows_with_district, , drop = FALSE]


# -------------------------------
# make votes_single_nonLDPc

# votes for a single non-LDP candidate/votes cast

# Can use elec.dat

# votes_single_nonLDPc: for every municipality-year row, the largest vote
# count received by any candidate flagged as non-LDP (LDP_k == 0) among
# candidate slots 1-17; 0 when no slot is flagged as non-LDP.
#
# Built column-wise instead of row by row (no per-row subsetting, no vector
# growth, no per-row progress output).
n_mun_years <- nrow(elec.dat)
best_non_ldp_votes <- rep(-Inf, n_mun_years)   # -Inf = no non-LDP candidate yet
has_non_ldp_cand <- rep(FALSE, n_mun_years)

for (k in 1:17) {
  ldp_flag <- elec.dat[[paste0("LDP_", k)]]
  cand_votes <- elec.dat[[sprintf("cand_%02d_votes", k)]]
  stopifnot(
    length(ldp_flag) == n_mun_years,
    length(cand_votes) == n_mun_years
  )

  is_non_ldp <- ldp_flag == 0
  # A missing flag made the row-wise `if` fail; keep failing loudly.
  if (anyNA(is_non_ldp)) {
    stop("LDP_", k, " contains missing values", call. = FALSE)
  }

  # pmax() propagates NA votes, exactly as max() did in the row-wise version.
  best_non_ldp_votes <- ifelse(is_non_ldp,
                               pmax(best_non_ldp_votes, cand_votes),
                               best_non_ldp_votes)
  has_non_ldp_cand <- has_non_ldp_cand | is_non_ldp
}

# Rows without any non-LDP candidate are assigned 0.
votes_single_nonLDPc <- ifelse(has_non_ldp_cand, best_non_ldp_votes, 0)

length(votes_single_nonLDPc)
elec.dat$votes_single_nonLDPc <- votes_single_nonLDPc
# Inspect the variable just created (the original inspected
# votes_single_LDPc here, a copy-paste slip).
range(elec.dat$votes_single_nonLDPc, exclude = NULL)

elec.dat[1:50, c("votes_all_LDPc", "votes_single_nonLDPc", "mun_voted")]

# Calculate votes cast for single non LDP cands/votes cast
elec.dat$vs_single_nonLDPc <- elec.dat$votes_single_nonLDPc / elec.dat$mun_voted
range(elec.dat$vs_single_nonLDPc, exclude = NULL)







# -------------------------------
# make votes_single_LDPl
# -------------------------------
# votes for a single LDP loser/votes cast

# In each municipality-year, identify the largest number of votes that municipality
# supplied to an LDP loser.  Get the LDP losers in the district and the votes
# they received in the municipality, and record the maximum of this.
#
# A loser is a candidate with LDP_k == 1 and win_k == 0. Only candidate slots
# 2-9 are examined (presumably because slot 1 always holds a winner - confirm).
# Rows without an LDP loser receive 0.
#
# Built column-wise instead of row by row (no per-row subsetting, no vector
# growth, no per-row progress output).
n_mun_years <- nrow(elec.dat)
best_ldp_loser_votes <- rep(-Inf, n_mun_years)   # -Inf = no LDP loser seen yet
n_ldp_losers <- integer(n_mun_years)

for (k in 2:9) {
  ldp_flag <- elec.dat[[paste0("LDP_", k)]]
  win_flag <- elec.dat[[paste0("win_", k)]]
  cand_votes <- elec.dat[[sprintf("cand_%02d_votes", k)]]
  stopifnot(
    length(ldp_flag) == n_mun_years,
    length(win_flag) == n_mun_years,
    length(cand_votes) == n_mun_years
  )

  is_ldp_loser <- ldp_flag == 1 & win_flag == 0
  # An undecidable row made the row-wise `if` fail; keep failing loudly.
  if (anyNA(is_ldp_loser)) {
    stop("cannot tell whether candidate ", k, " is an LDP loser in some rows ",
         "(missing LDP_", k, " or win_", k, ")", call. = FALSE)
  }

  # pmax() propagates NA votes, exactly as max() did in the row-wise version.
  best_ldp_loser_votes <- ifelse(is_ldp_loser,
                                 pmax(best_ldp_loser_votes, cand_votes),
                                 best_ldp_loser_votes)
  n_ldp_losers <- n_ldp_losers + is_ldp_loser
}

# Sanity check carried over from the original loop: the number of LDP losers
# in a row must not exceed the number of seats in the district. The original
# used `break` here, which silently truncated the result vector; fail
# explicitly instead.
# NOTE(review): this bound looks copied from the LDP-winner section; confirm
# it is the intended check for losers.
seats <- elec.dat$totseat_in_electoral_district
bad_rows <- which(is.na(seats) | n_ldp_losers > seats)
if (length(bad_rows) > 0) {
  stop("more LDP losers than seats (or missing seat count) in ",
       length(bad_rows), " row(s); first offending row: ", bad_rows[1],
       call. = FALSE)
}

# Rows without any LDP loser are assigned 0.
votes_single_LDPl <- ifelse(n_ldp_losers > 0, best_ldp_loser_votes, 0)

length(votes_single_LDPl)
elec.dat$votes_single_LDPl <- votes_single_LDPl
# An NA in the printed range signals missing values in the column.
range(elec.dat$votes_single_LDPl, exclude = NULL)

# Count the zero values among post-1993 rows whose first-listed candidate is LDP.
check <- subset(elec.dat, elec.dat$year>1993 & (elec.dat$cand_01_pty=="LDP"))
sum(check$votes_single_LDPl==0)

elec.dat[1:50, c("votes_single_LDPl", "mun_voted")]

# Calculate votes cast for a single LDP loser/votes cast
elec.dat$vs_single_LDPl <- elec.dat$votes_single_LDPl / elec.dat$mun_voted
range(elec.dat$vs_single_LDPl, exclude = NULL)





# -------------------------------
# votes for a single non-LDP winner/votes cast

# In each municipality-year, identify the largest number of votes that municipality
# supplied to a non-LDP winner.  Get the non LDP winners in the district and the votes
# they received in the municipality, and record the maximum of this.

# votes_single_nonLDPw: for every municipality-year row, the largest vote
# count received by a winner (win_k == 1) who is not LDP (LDP_k == 0) among
# candidate slots 1-6; 0 when the row has no such winner.
#
# Built column-wise instead of row by row (no per-row subsetting, no vector
# growth).
n_mun_years <- nrow(elec.dat)
best_non_ldp_win_votes <- rep(-Inf, n_mun_years)   # -Inf = none seen yet
n_non_ldp_winners <- integer(n_mun_years)

for (k in 1:6) {
  ldp_flag <- elec.dat[[paste0("LDP_", k)]]
  win_flag <- elec.dat[[paste0("win_", k)]]
  cand_votes <- elec.dat[[sprintf("cand_%02d_votes", k)]]
  stopifnot(
    length(ldp_flag) == n_mun_years,
    length(win_flag) == n_mun_years,
    length(cand_votes) == n_mun_years
  )

  is_non_ldp_winner <- ldp_flag == 0 & win_flag == 1
  # An undecidable row made the row-wise `if` fail; keep failing loudly.
  if (anyNA(is_non_ldp_winner)) {
    stop("cannot tell whether candidate ", k, " is a non-LDP winner in some ",
         "rows (missing LDP_", k, " or win_", k, ")", call. = FALSE)
  }

  # pmax() propagates NA votes, exactly as max() did in the row-wise version.
  best_non_ldp_win_votes <- ifelse(is_non_ldp_winner,
                                   pmax(best_non_ldp_win_votes, cand_votes),
                                   best_non_ldp_win_votes)
  n_non_ldp_winners <- n_non_ldp_winners + is_non_ldp_winner
}

# Sanity check: a row cannot have more non-LDP winners than seats in its
# district. The original loop used `break` here, which silently truncated the
# result vector; fail explicitly instead.
seats <- elec.dat$totseat_in_electoral_district
bad_rows <- which(is.na(seats) | n_non_ldp_winners > seats)
if (length(bad_rows) > 0) {
  stop("more non-LDP winners than seats (or missing seat count) in ",
       length(bad_rows), " row(s); first offending row: ", bad_rows[1],
       call. = FALSE)
}

# Rows without any non-LDP winner are assigned 0.
votes_single_nonLDPw <- ifelse(n_non_ldp_winners > 0, best_non_ldp_win_votes, 0)

length(votes_single_nonLDPw)
elec.dat$votes_single_nonLDPw <- votes_single_nonLDPw
# An NA in the printed range signals missing values in the column.
range(elec.dat$votes_single_nonLDPw, exclude = NULL)

# check:
# After 1996, when cand_01_pty is not LDP, votes_single_nonLDPw should have no 0s.
check <- subset(elec.dat, elec.dat$cand_01_pty!="LDP" & elec.dat$year>1995)
sum(check$votes_single_nonLDPw==0)

elec.dat[1:50, c("votes_single_nonLDPw", "mun_voted")]

# Calculate votes cast for a single non-LDP winner/votes cast
elec.dat$vs_single_nonLDPw <- elec.dat$votes_single_nonLDPw / elec.dat$mun_voted
range(elec.dat$vs_single_nonLDPw, exclude = NULL)





# -------------------------------
# votes for a single non-LDPI winner/votes cast

# In each municipality-year, identify the largest number of votes that municipality
# supplied to a non-LDPI winner, i.e. a winner (win_k == 1) whose LDP_kplus
# flag is 0.  Record the maximum over candidate slots 1-6; rows without such
# a winner receive 0.
# (LDP_kplus presumably flags LDP candidates plus LDP-affiliated independents,
# "LDPI" - confirm against where those columns are built.)
#
# Built column-wise instead of row by row (no per-row subsetting, no vector
# growth).
n_mun_years <- nrow(elec.dat)
best_non_ldpi_win_votes <- rep(-Inf, n_mun_years)   # -Inf = none seen yet
n_non_ldpi_winners <- integer(n_mun_years)

for (k in 1:6) {
  ldp_plus_flag <- elec.dat[[paste0("LDP_", k, "plus")]]
  win_flag <- elec.dat[[paste0("win_", k)]]
  cand_votes <- elec.dat[[sprintf("cand_%02d_votes", k)]]
  stopifnot(
    length(ldp_plus_flag) == n_mun_years,
    length(win_flag) == n_mun_years,
    length(cand_votes) == n_mun_years
  )

  is_non_ldpi_winner <- ldp_plus_flag == 0 & win_flag == 1
  # An undecidable row made the row-wise `if` fail; keep failing loudly.
  if (anyNA(is_non_ldpi_winner)) {
    stop("cannot tell whether candidate ", k, " is a non-LDPI winner in some ",
         "rows (missing LDP_", k, "plus or win_", k, ")", call. = FALSE)
  }

  # pmax() propagates NA votes, exactly as max() did in the row-wise version.
  best_non_ldpi_win_votes <- ifelse(is_non_ldpi_winner,
                                    pmax(best_non_ldpi_win_votes, cand_votes),
                                    best_non_ldpi_win_votes)
  n_non_ldpi_winners <- n_non_ldpi_winners + is_non_ldpi_winner
}

# Sanity check: a row cannot have more such winners than seats in its
# district. The original loop used `break` here, which silently truncated the
# result vector; fail explicitly instead.
seats <- elec.dat$totseat_in_electoral_district
bad_rows <- which(is.na(seats) | n_non_ldpi_winners > seats)
if (length(bad_rows) > 0) {
  stop("more non-LDPI winners than seats (or missing seat count) in ",
       length(bad_rows), " row(s); first offending row: ", bad_rows[1],
       call. = FALSE)
}

# Rows without any non-LDPI winner are assigned 0.
votes_single_nonLDPIw <- ifelse(n_non_ldpi_winners > 0, best_non_ldpi_win_votes, 0)

length(votes_single_nonLDPIw)
elec.dat$votes_single_nonLDPIw <- votes_single_nonLDPIw
# An NA in the printed range signals missing values in the column.
range(elec.dat$votes_single_nonLDPIw, exclude = NULL)

# check:
# After 1996, when cand_01_pty is neither LDP nor LDPI, votes_single_nonLDPIw
# should have no 0s.
check <- subset(elec.dat, !(elec.dat$cand_01_pty=="LDP"|elec.dat$cand_01_pty=="LDPI") & elec.dat$year>1995)
sum(check$votes_single_nonLDPIw==0)

elec.dat[1:50, c("votes_single_nonLDPIw", "mun_voted")]

# Calculate votes cast for a single non-LDPI winner/votes cast
elec.dat$vs_single_nonLDPIw <- elec.dat$votes_single_nonLDPIw / elec.dat$mun_voted
range(elec.dat$vs_single_nonLDPIw, exclude = NULL)








# -------------------------------
# make votes_single_oppw
# -------------------------------
# votes for a single opposition winner/votes cast

# Only need to examine candidate 1 as others are losers

# 1996 DPJ or NFP
# 2000 DPJ
# 2003 DPJ
# 2005 DPJ
# 2009 DPJ
# 2012: DPJ or Ishin
# 2014: DPJ or Ishin

# Candidate 1:
# Candidate 1:
# 0/1 indicators for whether the first-listed candidate's party label is
# NFP, DPJ or JRP (NA where cand_01_pty is missing).
# ifelse() is applied to the whole column at once; the row-by-row version
# subset the data frame per row, grew three vectors with c() and printed a
# progress line for every row.
top_cand_party <- elec.dat$cand_01_pty

NFP_1 <- ifelse(top_cand_party == "NFP", 1, 0)
DPJ_1 <- ifelse(top_cand_party == "DPJ", 1, 0)
JRP_1 <- ifelse(top_cand_party == "JRP", 1, 0)

# All four numbers printed below should agree.
nrow(elec.dat)
length(DPJ_1)
length(NFP_1)
length(JRP_1)

elec.dat$NFP_1 <- NFP_1
elec.dat$DPJ_1 <- DPJ_1
elec.dat$JRP_1 <- JRP_1



# -------------------------------
# votes for a single DPJ winner/votes cast

# In each municipality-year, record the votes the municipality supplied to a
# DPJ winner: the votes of candidate 1 when DPJ_1 == 1, and 0 otherwise.
# Only candidate 1 is examined.
#
# Computed on whole columns; the row-by-row version subset the data frame
# per row, grew the result with c() and printed a progress line per row.

# A missing flag made the row-wise `if` fail; keep failing loudly.
if (anyNA(elec.dat$DPJ_1)) {
  stop("DPJ_1 contains missing values", call. = FALSE)
}
votes_single_DPJw <- ifelse(elec.dat$DPJ_1 == 1, elec.dat$cand_01_votes, 0)

length(votes_single_DPJw)
elec.dat$votes_single_DPJw <- votes_single_DPJw
# An NA in the printed range signals missing values in the column.
range(elec.dat$votes_single_DPJw, exclude = NULL)

# check:
# After 1996, when cand_01_pty is LDP, votes_single_DPJw should be zero
check <- subset(elec.dat, elec.dat$cand_01_pty=="LDP" & elec.dat$year>1995)
table(check$votes_single_DPJw)
# all zeros

elec.dat[1:50, c("votes_single_DPJw", "mun_voted")]

# Calculate votes cast for a single DPJ winner/votes cast
elec.dat$vs_single_DPJw <- elec.dat$votes_single_DPJw / elec.dat$mun_voted
range(elec.dat$vs_single_DPJw, exclude = NULL)



# -------------------------------
# votes for a single NFP winner/votes cast

# In each municipality-year, record the votes the municipality supplied to an
# NFP winner: the votes of candidate 1 when NFP_1 == 1, and 0 otherwise.
# Only candidate 1 is examined.
#
# Computed on whole columns; the row-by-row version subset the data frame
# per row, grew the result with c() and printed a progress line per row.

# A missing flag made the row-wise `if` fail; keep failing loudly.
if (anyNA(elec.dat$NFP_1)) {
  stop("NFP_1 contains missing values", call. = FALSE)
}
votes_single_NFPw <- ifelse(elec.dat$NFP_1 == 1, elec.dat$cand_01_votes, 0)

length(votes_single_NFPw)
elec.dat$votes_single_NFPw <- votes_single_NFPw
# An NA in the printed range signals missing values in the column.
range(elec.dat$votes_single_NFPw, exclude = NULL)

# check:
# After 1996, when cand_01_pty is LDP, votes_single_NFPw should be zero
check <- subset(elec.dat, elec.dat$cand_01_pty=="LDP" & elec.dat$year>1995)
table(check$votes_single_NFPw)
# all zeros

elec.dat[1:50, c("votes_single_NFPw", "mun_voted")]

# Calculate votes cast for a single NFP winner/votes cast
elec.dat$vs_single_NFPw <- elec.dat$votes_single_NFPw / elec.dat$mun_voted
range(elec.dat$vs_single_NFPw, exclude = NULL)



# -------------------------------
# votes for a single JRP winner/votes cast

# In each municipality-year, record the votes the municipality supplied to a
# JRP winner: the votes of candidate 1 when JRP_1 == 1, and 0 otherwise.
# Only candidate 1 is examined.
#
# Computed on whole columns; the row-by-row version subset the data frame
# per row, grew the result with c() and printed a progress line per row.

# A missing flag made the row-wise `if` fail; keep failing loudly.
if (anyNA(elec.dat$JRP_1)) {
  stop("JRP_1 contains missing values", call. = FALSE)
}
votes_single_JRPw <- ifelse(elec.dat$JRP_1 == 1, elec.dat$cand_01_votes, 0)

length(votes_single_JRPw)
elec.dat$votes_single_JRPw <- votes_single_JRPw
# An NA in the printed range signals missing values in the column.
range(elec.dat$votes_single_JRPw, exclude = NULL)

# check:
# After 1996, when cand_01_pty is LDP, votes_single_JRPw should be zero
check <- subset(elec.dat, elec.dat$cand_01_pty=="LDP" & elec.dat$year>1995)
table(check$votes_single_JRPw)
# all zeros

elec.dat[1:50, c("votes_single_JRPw", "mun_voted")]

# Calculate votes cast for a single JRP winner/votes cast
elec.dat$vs_single_JRPw <- elec.dat$votes_single_JRPw / elec.dat$mun_voted
range(elec.dat$vs_single_JRPw, exclude = NULL)







# -----------------------------------------------------
# Create ranked versions of vs_single_DPJw, vs_single_NFPw, vs_single_JRPw

# -----------------
# vs_single_DPJw first

# Rank vs_single_DPJw within each district_year; tied municipalities share
# the average of their ranks.
elec.dat <- transform(elec.dat, rank.vs_single_DPJw = ave(vs_single_DPJw, district_year,
                                                          FUN = function(x) rank(x, ties.method="average")))

# When the district_year has no DPJ winner, vs_single_DPJw is 0 for every
# municipality, but rank() still hands out (tied) ranks. Those rows are set
# to NA below.

# District-years in which every municipality has votes_single_DPJw == 0:
no_dpj_winner <- vapply(
  split(elec.dat$votes_single_DPJw == 0, elec.dat$district_year),
  all,
  logical(1)
)
# An undecidable district-year (missing votes, no non-zero value) made the
# original loop fail; keep failing loudly.
if (anyNA(no_dpj_winner)) {
  stop("votes_single_DPJw is missing in district-year(s): ",
       paste(names(no_dpj_winner)[is.na(no_dpj_winner)], collapse = ", "),
       call. = FALSE)
}
# Sorted by district_year rather than in order of appearance; the vector is
# only used for membership tests and counting.
all_same <- names(no_dpj_winner)[no_dpj_winner]

length(all_same)
check <- subset(elec.dat, elec.dat$district_year %in% all_same)
length(unique(check$district_year))
check$rank.vs_single_DPJw # these need to be NA

# Create a new version of the "rank" variable that receives NA when municipalities are in districts
# WITHOUT DPJ winners:
elec.dat$new.rank.vs_single_DPJw <- ifelse(elec.dat$district_year %in% all_same,
                                           NA,
                                           elec.dat$rank.vs_single_DPJw)

sum(is.na(elec.dat$new.rank.vs_single_DPJw)) == nrow(check)

# Because the number of municipalities differs by district, we can standardize this so that in every
# district-year, the municipality with the lowest degree of identification with a single DPJ winner
# receives 0 and the highest receives 1.
# This formula will do that: (absolute rank-1)/(total number of munis-1)
# NOTE(review): `num` is assumed to hold the number of municipalities in the
# district-year; a district-year with num == 1 yields NaN (0/0) - confirm.
rvs_single_DPJw <- (elec.dat$new.rank.vs_single_DPJw - 1) / (elec.dat$num - 1)

length(rvs_single_DPJw) == nrow(elec.dat)
elec.dat$rvs_single_DPJw <- rvs_single_DPJw

# Check:
check <- subset(elec.dat, elec.dat$year==2014 & elec.dat$hor_electoral_district==101)
nrow(check)
check[, c("muncode_num", "district_year", "mun_voted",
          "vs_single_DPJw",  "rvs_single_DPJw")]











# -------------------------------
# we have resurrectSSD, KOMSSD.  Make indicator for an electoral
# district with a DPJ winner who is former LDP and an indicator for an electoral
# district with a DPJ winner who is former JSP.
# -------------------------------





# ----------------------------------
# Let's read in the Reed-Smith data and get the DPJ winners in every election

# Candidate-level file, stored in Shift-JIS.
dta <- read.csv(file = "Reed-Smith-JHRED.csv", fileEncoding = "SHIFT-JIS")

# Get DPJ SSD winners only: post-1993 general elections (no by-elections),
# party DPJ, result == 1.
is_dpj_ssd_winner <- dta$year > 1993 &
  dta$byelection == 0 &
  dta$party_en == "DPJ" &
  dta$result == 1
dta2 <- dta[which(is_dpj_ssd_winner), , drop = FALSE]

# Number of such winners per district code.
table(dta2$kucode)







# -------------------------------------------
# Get 1996 DPJ winners:
# Helper shared by the seven election-year stanzas below.
#
# For each name in `winner_names`, collect every row of `earlier_runs` with
# the same name_jp and return one data frame with the columns
#   cand   - candidate name (name_jp)
#   year   - year of the earlier candidacy
#   pty    - party_en of the earlier candidacy
#   kucode - district code of the earlier candidacy
# Winners with no earlier row contribute nothing. Rows are ordered winner by
# winner and, within a winner, grouped by year in order of first appearance
# (the order the original nested loops produced). If nobody matches, the
# result has zero rows and only the first column is named ("cand").
#
# Compared with the seven copy-pasted loops this replaces:
#   * the inner loop no longer reuses the outer loop's index variable;
#   * rows are selected with which(), so a missing name_jp cannot inject
#     all-NA rows;
#   * no scratch objects (tops, tops2, tops3, ...) are left in, or have to be
#     removed from, the workspace.
prior_non_dpj_runs <- function(winner_names, earlier_runs) {
  per_winner <- lapply(seq_along(winner_names), function(w) {
    hits <- which(earlier_runs$name_jp == winner_names[w])
    if (length(hits) == 0) {
      return(NULL)
    }
    runs <- earlier_runs[hits, , drop = FALSE]
    # Group the rows by year, keeping years in order of first appearance.
    runs <- runs[order(match(runs$year, unique(runs$year))), , drop = FALSE]
    cbind(as.character(runs$name_jp),
          year = runs$year,
          pty = runs$party_en,
          kucode = runs$kucode)
  })

  empty <- matrix(NA_character_, ncol = 4, nrow = 0)
  # rbind() ignores the NULL entries of winners without an earlier candidacy.
  all_runs <- as.data.frame(do.call(rbind, c(list(empty), per_winner)))
  colnames(all_runs)[1] <- "cand"
  all_runs
}


# -------------------------------------------
# Get 1996 DPJ winners:
dpjwinners.96 <- subset(dta2, dta2$year==1996)
cands.96 <- dpjwinners.96$name_jp
length(cands.96)
# 17 DPJ winners

# Get all cands in elections prior to 1996 that were NOT DPJ
no.dpj <- subset(dta, dta$year<1996 & dta$byelection==0 & party_en!="DPJ")

# For each cand in cands.96, did he contest a previous election from another party?
# If he did, record the election, the party, and the district:
tops.96 <- prior_non_dpj_runs(cands.96, no.dpj)
length(unique(tops.96$cand))
# 14


# -------------------------------------------
# Get 2000 DPJ winners:
dpjwinners.00 <- subset(dta2, dta2$year==2000)
cands.00 <- dpjwinners.00$name_jp
length(cands.00)
# 80 DPJ winners

# Get all cands in elections prior to 2000 that were NOT DPJ
no.dpj <- subset(dta, dta$year<2000 & dta$byelection==0 & party_en!="DPJ")

# For each cand in cands.00, did he contest a previous election from another party?
# If he did, record the election, the party, and the district:
tops.00 <- prior_non_dpj_runs(cands.00, no.dpj)
length(unique(tops.00$cand))
# 53


# -------------------------------------------
# Get 2003 DPJ winners:
dpjwinners.03 <- subset(dta2, dta2$year==2003)
cands.03 <- dpjwinners.03$name_jp
length(cands.03)
# 105

# Get all cands in elections prior to 2003 that were NOT DPJ
no.dpj <- subset(dta, dta$year<2003 & dta$byelection==0 & party_en!="DPJ")

# For each cand in cands.03, did he contest a previous election from another party?
# If he did, record the election, the party, and the district:
tops.03 <- prior_non_dpj_runs(cands.03, no.dpj)
length(unique(tops.03$cand))
# 57


# -------------------------------------------
# Get 2005 DPJ winners:
dpjwinners.05 <- subset(dta2, dta2$year==2005)
cands.05 <- dpjwinners.05$name_jp
length(cands.05)
# 52

# Get all cands in elections prior to 2005 that were NOT DPJ
no.dpj <- subset(dta, dta$year<2005 & dta$byelection==0 & party_en!="DPJ")

# For each cand in cands.05, did he contest a previous election from another party?
# If he did, record the election, the party, and the district:
tops.05 <- prior_non_dpj_runs(cands.05, no.dpj)
length(unique(tops.05$cand))
# 30
table(tops.05$year)


# -------------------------------------------
# Get 2009 DPJ winners:
dpjwinners.09 <- subset(dta2, dta2$year==2009)
cands.09 <- dpjwinners.09$name_jp
length(cands.09)
# 221

# Get all cands in elections prior to 2009 that were NOT DPJ
no.dpj <- subset(dta, dta$year<2009 & dta$byelection==0 & party_en!="DPJ")

# For each cand in cands.09, did he contest a previous election from another party?
# If he did, record the election, the party, and the district:
tops.09 <- prior_non_dpj_runs(cands.09, no.dpj)
length(unique(tops.09$cand))
# 74

table(tops.09$year)


# -------------------------------------------
# Get 2012 DPJ winners:
dpjwinners.12 <- subset(dta2, dta2$year==2012)
cands.12 <- dpjwinners.12$name_jp
length(cands.12)
# 27

# Get all cands in elections prior to 2012 that were NOT DPJ
no.dpj <- subset(dta, dta$year<2012 & dta$byelection==0 & party_en!="DPJ")

# For each cand in cands.12, did he contest a previous election from another party?
# If he did, record the election, the party, and the district:
tops.12 <- prior_non_dpj_runs(cands.12, no.dpj)
length(unique(tops.12$cand))
# 12

table(tops.12$year)


# -------------------------------------------
# Get 2014 DPJ winners:
dpjwinners.14 <- subset(dta2, dta2$year==2014)
cands.14 <- dpjwinners.14$name_jp
length(cands.14)
# 38

# Get all cands in elections prior to 2014 that were NOT DPJ
no.dpj <- subset(dta, dta$year<2014 & dta$byelection==0 & party_en!="DPJ")

# For each cand in cands.14, did he contest a previous election from another party?
# If he did, record the election, the party, and the district:
tops.14 <- prior_non_dpj_runs(cands.14, no.dpj)
length(unique(tops.14$cand))
# 16
table(tops.14$year)








# ------------------------------------------------
# Of the DPJ winners in each year, if the DPJ candidate has ever run as an LDP
# candidate, extract the year_district
#
# Each tops.YY data frame (built above) has the columns cand, year, pty and
# kucode: one row per earlier non-DPJ record of a DPJ winner of election YY.
# Below, formerLDP.YY keeps the rows whose earlier party was the LDP and
# cands.YY the distinct names in those rows. Rows with a missing pty are dropped.
# Note that cands.12 and cands.14 (previously the names of all DPJ winners of
# those elections) are overwritten here.

# 1996 DPJ winners with an earlier LDP record
formerLDP.96 <- tops.96[which(tops.96$pty == "LDP"), , drop = FALSE]
cands.96 <- unique(formerLDP.96[["cand"]])

# 2000 DPJ winners with an earlier LDP record
formerLDP.00 <- tops.00[which(tops.00$pty == "LDP"), , drop = FALSE]
cands.00 <- unique(formerLDP.00[["cand"]])

# 2003 DPJ winners with an earlier LDP record
formerLDP.03 <- tops.03[which(tops.03$pty == "LDP"), , drop = FALSE]
cands.03 <- unique(formerLDP.03[["cand"]])

# 2005 DPJ winners with an earlier LDP record
formerLDP.05 <- tops.05[which(tops.05$pty == "LDP"), , drop = FALSE]
cands.05 <- unique(formerLDP.05[["cand"]])

# 2009 DPJ winners with an earlier LDP record
formerLDP.09 <- tops.09[which(tops.09$pty == "LDP"), , drop = FALSE]
cands.09 <- unique(formerLDP.09[["cand"]])

# 2012 DPJ winners with an earlier LDP record
formerLDP.12 <- tops.12[which(tops.12$pty == "LDP"), , drop = FALSE]
cands.12 <- unique(formerLDP.12[["cand"]])

# 2014 DPJ winners with an earlier LDP record
formerLDP.14 <- tops.14[which(tops.14$pty == "LDP"), , drop = FALSE]
cands.14 <- unique(formerLDP.14[["cand"]])




# ------------------------------------------------
# cands.96 etc contain DPJ winners in each year who were former LDP politicians.
#
# For every election, look each of those winners up in dta2 for that election
# year and record the "<year>_<kucode>" key of the row(s) found.

# Return the "<year>_<kucode>" keys of the rows of `winners` that belong to
# election `election_year` and whose name_jp equals one of `cands`.
#   winners       data frame with columns year, name_jp and kucode
#   election_year the election year to look in
#   cands         vector of candidate names (may be empty)
# Keys are returned in the order of `cands` (rows in data order within a
# candidate). The result is NULL when `cands` is empty.
# which() discards NA comparisons, so a missing name never yields an "NA_NA"
# key, and seq_along() makes an empty `cands` a no-op.
district_years_for <- function(winners, election_year, cands) {
  keys <- lapply(seq_along(cands), function(k) {
    rows <- which(winners$year == election_year & winners$name_jp == cands[k])
    paste(winners$year[rows], winners$kucode[rows], sep = "_")
  })
  unlist(keys)
}

district_years.96 <- district_years_for(dta2, 1996, cands.96)
district_years.00 <- district_years_for(dta2, 2000, cands.00)
district_years.03 <- district_years_for(dta2, 2003, cands.03)
district_years.05 <- district_years_for(dta2, 2005, cands.05)
district_years.09 <- district_years_for(dta2, 2009, cands.09)
district_years.12 <- district_years_for(dta2, 2012, cands.12)
district_years.14 <- district_years_for(dta2, 2014, cands.14)

# Pool the keys of all elections.
d_years <- c(district_years.96, district_years.00,
             district_years.03, district_years.05,
             district_years.09, district_years.12, district_years.14)


# d_years holds the district_year keys in which the DPJ winner is a former
# LDP candidate.

# Attach onto elec.dat

# Diagnostic: how many of the collected keys occur in elec.dat$district_year
sum(d_years %in% elec.dat$district_year)

# formerLDP is 1 for rows whose district_year is one of those keys, 0 otherwise.
elec.dat$formerLDP <- ifelse(is.na(match(elec.dat$district_year, d_years)), 0, 1)










# -----------------------------------------------------------------------
# add variables we made for transfer analyses

# LDP votes divided by the municipality's voting population
elec.dat[["VotesLDP_vp"]] <- with(elec.dat, VotesLDP / mun_voting_pop)

# -----------------------------------
# Make a VotesDPJ_vp variable
#
# First step: for each candidate slot 01-06, flag whether that slot's party is
# the DPJ. DPJ_k is 1 where cand_0k_pty == "DPJ", 0 for any other party, and NA
# where the party is missing.
#
# The comparison is done on whole columns. This gives the same values as
# testing one row at a time, without subsetting the data frame and re-growing
# a result vector once per row.
for(slot in 1:6){
  pty.col <- sprintf("cand_%02d_pty", slot)
  elec.dat[[paste0("DPJ_", slot)]] <- ifelse(elec.dat[[pty.col]] == "DPJ", 1, 0)
}
rm(slot, pty.col)

# Total DPJ votes per row: add up the votes of every candidate slot (01-06)
# whose DPJ_k flag is 1. A slot whose flag or vote count is NA contributes
# nothing, so a row with no usable slot gets 0 rather than NA.
# rowSums() over the six per-slot products replaces a row-by-row sum() loop.
dpj.slot.votes <- cbind(elec.dat$DPJ_1 * elec.dat$cand_01_votes,
                        elec.dat$DPJ_2 * elec.dat$cand_02_votes,
                        elec.dat$DPJ_3 * elec.dat$cand_03_votes,
                        elec.dat$DPJ_4 * elec.dat$cand_04_votes,
                        elec.dat$DPJ_5 * elec.dat$cand_05_votes,
                        elec.dat$DPJ_6 * elec.dat$cand_06_votes)
votesDPJ <- rowSums(dpj.slot.votes, na.rm = TRUE)
rm(dpj.slot.votes)

length(votesDPJ)
elec.dat$VotesDPJ <- votesDPJ
# (range() has no `exclude` argument, so none is passed.)
range(elec.dat$VotesDPJ)

# Votes for the DPJ/voting pop
elec.dat$VotesDPJ_vp <- elec.dat$VotesDPJ/elec.dat$mun_voting_pop

# make sumDPJ_vsharevp
#
# VotesDPJ_vp where the party of candidate slot 01 is the DPJ, 0 where it is
# another party, and NA where cand_01_pty is missing.
# NOTE(review): VotesDPJ adds up DPJ votes over slots 01-06, but this condition
# only looks at slot 01 -- confirm that is intended.
elec.dat$sumDPJ_vsharevp <- ifelse(elec.dat$cand_01_pty == "DPJ",
                                   elec.dat$VotesDPJ_vp,
                                   0)

# Eyeball the first rows
elec.dat[1:100, c("cand_01_pty", "VotesDPJ_vp", "sumDPJ_vsharevp")]






# ---------------------------------------------
# MERGE THEM BACK ONTO DAT

# Reduce elec.dat to the merge key (id) plus the new variables. The order of
# this vector is the order in which the columns are appended to dat.
myvars <- c("id",
            "votes_single_nonLDPc", "vs_single_nonLDPc",
            "votes_single_LDPl", "vs_single_LDPl",
            "votes_single_nonLDPw", "vs_single_nonLDPw",
            "votes_single_nonLDPIw", "vs_single_nonLDPIw",
            "NFP_1", "DPJ_1", "JRP_1",
            "votes_single_DPJw", "vs_single_DPJw",
            "votes_single_NFPw", "vs_single_NFPw",
            "votes_single_JRPw", "vs_single_JRPw",
            "rank.vs_single_DPJw", "new.rank.vs_single_DPJw",
            "rvs_single_DPJw",
            "formerLDP",
            "VotesLDP_vp",
            "DPJ_2", "DPJ_3", "DPJ_4", "DPJ_5", "DPJ_6",
            "VotesDPJ", "VotesDPJ_vp", "sumDPJ_vsharevp")

elec.dat2 <- elec.dat[myvars]

# Full outer join on id: rows of dat without a counterpart in elec.dat2 are
# kept, with NA in the new columns.
dat1 <- merge(dat, elec.dat2, by = "id", all = TRUE)

saveRDS(dat1, file = "Master_plus_Snow_Turn_Trans_Dis4.rds")

